From 66ce1d3d36349015244f9329e088e80524b9786d Mon Sep 17 00:00:00 2001
From: folivoramanh
Date: Thu, 12 Jun 2025 12:28:49 +0700
Subject: [PATCH 1/4] Add Vietnamese text normalization for cardinal semiotic
 class

Signed-off-by: folivoramanh
---
 .../text_normalization/normalize.py           |   3 +
 .../text_normalization/vi/__init__.py         |  13 +
 .../vi/data/numbers/__init__.py               |   0
 .../vi/data/numbers/digit.tsv                 |   9 +
 .../vi/data/numbers/digit_special.tsv         |   3 +
 .../vi/data/numbers/teen.tsv                  |  10 +
 .../vi/data/numbers/ties.tsv                  |   8 +
 .../vi/data/numbers/units.tsv                 |   5 +
 .../vi/data/numbers/zero.tsv                  |   1 +
 .../text_normalization/vi/data/whitelist.tsv  |   0
 .../text_normalization/vi/taggers/__init__.py |  13 +
 .../text_normalization/vi/taggers/cardinal.py | 282 ++++++++++++++++++
 .../vi/taggers/punctuation.py                 |  57 ++++
 .../vi/taggers/tokenize_and_classify.py       |  92 ++++++
 .../vi/taggers/whitelist.py                   |  70 +++++
 .../text_normalization/vi/taggers/word.py     |  34 +++
 .../text_normalization/vi/utils.py            |  42 +++
 .../vi/verbalizers/__init__.py                |  13 +
 .../vi/verbalizers/cardinal.py                |  55 ++++
 .../vi/verbalizers/verbalize.py               |  38 +++
 .../vi/verbalizers/verbalize_final.py         |  72 +++++
 .../vi/verbalizers/whitelist.py               |  42 +++
 .../text_normalization/vi/verbalizers/word.py |  37 +++
 .../test_cases_cardinal.txt                   | 107 +++++++
 .../nemo_text_processing/vi/test_cardinal.py  |  45 ++-
 ..._sparrowhawk_inverse_text_normalization.sh |   2 +-
 .../vi/test_sparrowhawk_normalization.sh      |  77 +++++
 .../pynini_export.py                          |   6 +-
 28 files changed, 1119 insertions(+), 17 deletions(-)
 create mode 100644 nemo_text_processing/text_normalization/vi/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/whitelist.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/punctuation.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/whitelist.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/word.py
 create mode 100644 nemo_text_processing/text_normalization/vi/utils.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/word.py
 create mode 100644 tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
 create mode 100644 tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh

diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 82f8f43d2..329b28338 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -174,6 +174,9 @@ def __init__(
         elif lang == 'ja':
             from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
+        elif lang == 'vi':
+            from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
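The `normalize.py` hunk above wires Vietnamese into the `Normalizer` language dispatch. A minimal smoke test of the new branch, assuming this patch is applied and pynini is installed; the call signatures mirror those used in the updated `tests/nemo_text_processing/vi/test_cardinal.py`, and the expected output comes from `test_cases_cardinal.txt`:

```python
# Sketch only: exercise the new `lang='vi'` branch end to end.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case='cased', lang='vi')
# Expected per test_cases_cardinal.txt: "một trăm linh một"
print(normalizer.normalize("101", verbose=False))
```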
diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
new file mode 100644
index 000000000..573c20bd4
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
@@ -0,0 +1,9 @@
+1	một
+2	hai
+3	ba
+4	bốn
+5	năm
+6	sáu
+7	bảy
+8	tám
+9	chín
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
new file mode 100644
index 000000000..919baaf6e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
@@ -0,0 +1,3 @@
+1	một	mốt
+4	bốn	tư
+5	năm	lăm
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
new file mode 100644
index 000000000..8d99f8a69
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
@@ -0,0 +1,10 @@
+10	mười
+11	mười một
+12	mười hai
+13	mười ba
+14	mười bốn
+15	mười lăm
+16	mười sáu
+17	mười bảy
+18	mười tám
+19	mười chín
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
new file mode 100644
index 000000000..da88b8ab8
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
@@ -0,0 +1,8 @@
+2	hai mươi
+3	ba mươi
+4	bốn mươi
+5	năm mươi
+6	sáu mươi
+7	bảy mươi
+8	tám mươi
+9	chín mươi
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
new file mode 100644
index 000000000..c8a08083c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
@@ -0,0 +1,5 @@
+thousand	nghìn
+million	triệu
+billion	tỷ
+hundred	trăm
+linh	linh
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
new file mode 100644
index 000000000..df062e38c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
@@ -0,0 +1 @@
+0	không
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/whitelist.tsv b/nemo_text_processing/text_normalization/vi/data/whitelist.tsv
new file mode 100644
index 000000000..e69de29bb
diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
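The number data above uses two layouts: two-column files mapping a written form to its spoken form, and the three-column `digit_special.tsv` mapping a digit to its standard form (used after `linh`) and its contextual form (used after `mươi`). A short orientation sketch, not part of the patch, showing how both layouts read:

```python
# Orientation only: the two TSV layouts used by the Vietnamese number data.
import csv

def read_tsv(path):
    with open(path, encoding="utf-8") as f:
        return list(csv.reader(f, delimiter="\t"))

# digit.tsv rows:         ["1", "một"], ..., ["9", "chín"]
# digit_special.tsv rows: ["1", "một", "mốt"], ["4", "bốn", "tư"], ["5", "năm", "lăm"]
# e.g. 51 -> "năm mươi mốt" (contextual form), 101 -> "một trăm linh một" (standard form)
```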
diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py
new file mode 100644
index 000000000..fdd3cae82
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space
+from nemo_text_processing.text_normalization.vi.utils import get_abs_path
+
+
+def load_data_map(filename):
+    """Load TSV data as pynini string map."""
+    mappings = []
+    with open(get_abs_path(f"data/numbers/{filename}"), 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                parts = line.split('\t')
+                if len(parts) >= 2:
+                    mappings.append((parts[0], parts[1]))
+    return pynini.string_map(mappings)
+
+
+class CardinalFst(GraphFst):
+    """
+    Simplified Vietnamese cardinal FST using recursive pattern building.
+    Reduced from 700+ lines to ~200 lines while maintaining full functionality.
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+
+        # Load all basic data maps
+        zero = load_data_map("zero.tsv")
+        digit = load_data_map("digit.tsv")
+        teen = load_data_map("teen.tsv")
+        ties = load_data_map("ties.tsv")
+
+        # Load units as dict for easy access
+        units = {}
+        with open(get_abs_path("data/numbers/units.tsv"), 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split('\t')
+                if len(parts) == 2:
+                    units[parts[0]] = parts[1]
+
+        # Load special digits (contextual variants)
+        special = {}
+        with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split('\t')
+                if len(parts) >= 3:
+                    special[parts[0]] = {'std': parts[1], 'alt': parts[2]}
+
+        # Build core patterns
+        single_digit = digit
+
+        # Special digits for specific contexts (X1, X4, X5 → mốt, tư, lăm)
+        special_1 = pynini.cross("1", special["1"]["alt"])  # mốt
+        special_4 = pynini.cross("4", special["4"]["alt"])  # tư
+        special_5 = pynini.cross("5", special["5"]["alt"])  # lăm
+
+        # Linh digits (for 0X patterns) - use standard forms
+        linh_digits = pynini.union(
+            pynini.cross("1", special["1"]["std"]),  # một
+            pynini.cross("4", special["4"]["std"]),  # bốn
+            pynini.cross("5", special["5"]["std"]),  # năm
+            digit
+        )
+
+        # Two digit patterns
+        two_digit = pynini.union(
+            teen,  # 10-19
+            ties + pynutil.delete("0"),  # 20, 30, etc.
+            ties + insert_space + pynini.union(
+                special_1, special_4, special_5,  # X1, X4, X5 cases
+                pynini.union("2", "3", "6", "7", "8", "9") @ digit  # other digits
+            )
+        )
+
+        # Build hundreds (3 digits: 100-999)
+        hundreds_base = pynini.union(
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("00"),
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("0")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + insert_space + two_digit
+        )
+        hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ hundreds_base
+
+        # Build thousands (4-6 digits) with explicit digit constraints
+        # 4-digit thousands (1000-9999)
+        thousands_4d = pynini.union(
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        # 5-digit thousands (10000-99999)
+        thousands_5d = pynini.union(
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        # 6-digit thousands (100000-999999)
+        thousands_6d = pynini.union(
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        thousands = pynini.union(
+            pynini.closure(NEMO_DIGIT, 6, 6) @ thousands_6d,
+            pynini.closure(NEMO_DIGIT, 5, 5) @ thousands_5d,
+            pynini.closure(NEMO_DIGIT, 4, 4) @ thousands_4d
+        )
+
+        # Build millions (7-9 digits) with explicit patterns to fix precedence
+        # 7-digit millions (1000000-9999999)
+        millions_7d = pynini.union(
+            # Exact millions: 1000000, 2000000, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            # Millions with linh: 1000001, 1000002, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Millions with tens: 1000010, 1000020, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
+            # Millions with hundreds: 1000100, 1000200, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000")
+            + insert_space + hundreds_base,
+            # Millions with thousands: 5500000 -> năm triệu năm trăm nghìn
+            single_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d,
+            # Complex millions: X001YYY -> X triệu một nghìn YYY (critical fix for 1001001)
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00")
+            + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Complex millions: X0YZWWW -> X triệu YZ nghìn WWW (critical fix for 1050003)
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0")
+            + insert_space + two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Full millions: X123YZW -> X triệu YZW nghìn/trăm/etc (1050003)
+            single_digit + insert_space + pynutil.insert(units["million"]) + insert_space
+            + pynini.closure(NEMO_DIGIT, 3, 3) @ (
+                pynini.union(
+                    # YZW000 patterns - invalid for 6 digits, skip
+                    # YZ0ABC patterns
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+                    + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+                    # YZ0ABC patterns with tens
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+                    + insert_space + two_digit,
+                    # YYZABC patterns with hundreds
+                    hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base,
+                    # 0YYZABC patterns (hundreds only)
+                    pynutil.delete("0") + hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base,
+                    # 00YABC patterns (tens only)
+                    pynutil.delete("00") + hundreds_base,
+                    # Y00ABC patterns (single thousand)
+                    single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+                    + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+                    # YZ00AB patterns (tens of thousands)
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000")
+                )
+            )
+        )
+
+        # 8-digit millions (10000000-99999999)
+        millions_8d = pynini.union(
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000")
+            + insert_space + hundreds_base,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d
+        )
+
+        # 9-digit millions (100000000-999999999)
+        millions_9d = pynini.union(
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
pynutil.delete("0000") + + insert_space + two_digit, + hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") + + insert_space + hundreds_base, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d + ) + + millions = pynini.union( + pynini.closure(NEMO_DIGIT, 9, 9) @ millions_9d, + pynini.closure(NEMO_DIGIT, 8, 8) @ millions_8d, + pynini.closure(NEMO_DIGIT, 7, 7) @ millions_7d + ) + + # Build billions (10-12 digits) with explicit patterns + # 10-digit billions (1000000000-9999999999) + billions_10d = pynini.union( + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00000000") + + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("0000000") + + insert_space + two_digit, + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000") + + insert_space + hundreds_base, + # Complex billions: 1001001101 -> một tỷ một triệu một nghìn một trăm linh một + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00") + + insert_space + single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") + + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, + # Full billions with millions + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + # 11-digit billions (10000000000-99999999999) + billions_11d = pynini.union( + two_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + # 12-digit billions (100000000000-999999999999) + billions_12d = pynini.union( + hundreds_base + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + billions = pynini.union( + pynini.closure(NEMO_DIGIT, 12, 12) @ billions_12d, + pynini.closure(NEMO_DIGIT, 11, 11) @ billions_11d, + pynini.closure(NEMO_DIGIT, 10, 10) @ billions_10d + ) + + # Combine all patterns with proper precedence (longest first) + self.graph = pynini.union( + billions, # 10-12 digits + millions, # 7-9 digits + thousands, # 4-6 digits + hundreds, # 3 digits + two_digit, # 2 digits + single_digit, # 1 digit + zero # 0 + ).optimize() + + # For decimal usage + self.single_digits_graph = single_digit | zero + self.graph_with_and = self.graph + + # Build final FST with negative 
diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py
new file mode 100644
index 000000000..f0d20918e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst
+
+
+class PunctuationFst(GraphFst):
+    """
+    Finite state transducer for classifying punctuation for Vietnamese
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
+
+        # Common punctuation marks
+        # Use escape() for brackets since they are special regex chars
+        punct = (
+            pynini.accep(".")
+            | pynini.accep(",")
+            | pynini.accep("?")
+            | pynini.accep("!")
+            | pynini.accep(":")
+            | pynini.accep(";")
+            | pynini.accep("-")
+            | pynini.accep("–")
+            | pynini.accep("—")
+            | pynini.accep("(")
+            | pynini.accep(")")
+            | pynini.accep(pynini.escape("["))
+            | pynini.accep(pynini.escape("]"))
+            | pynini.accep(pynini.escape("{"))
+            | pynini.accep(pynini.escape("}"))
+            | pynini.accep('"')
+            | pynini.accep("'")
+            | pynini.accep("...")
+            | pynini.accep("…")
+        )
+
+        # Create the punctuation transduction
+        graph = pynutil.insert('name: "') + punct + pynutil.insert('"')
+
+        final_graph = pynutil.insert("punctuation { ") + graph + pynutil.insert(" }")
+        self.fst = final_graph.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..52038b14d
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
+from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    def __init__(
+        self,
+        input_case: str,
+        deterministic: bool = True,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logger.info("Creating Vietnamese ClassifyFst grammars.")
+
+            start_time = time.time()
+            cardinal = CardinalFst(deterministic=deterministic)
+            cardinal_graph = cardinal.fst
+            logger.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes")
+
+            start_time = time.time()
+            punctuation = PunctuationFst(deterministic=deterministic)
+            punct_graph = punctuation.fst
+            logger.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")
+
+            start_time = time.time()
+            # Pass the user-provided whitelist file through (and avoid shadowing
+            # the `whitelist` argument with the FST instance)
+            whitelist_fst = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
+            whitelist_graph = whitelist_fst.fst
+            logger.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes")
+
+            start_time = time.time()
+            word_graph = WordFst(deterministic=deterministic).fst
+            logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")
+
+            classify = (
+                pynutil.add_weight(whitelist_graph, 0.8)
+                | pynutil.add_weight(cardinal_graph, 0.9)
+                | pynutil.add_weight(word_graph, 100)
+            )
+            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            token_plus_punct = (
+                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
+            )
+
+            graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct)
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
\ No newline at end of file
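For orientation, the intermediate string this classifier is expected to produce between tagging and verbalization, inferred from the grammars in this patch (the verbalizer docstrings further down show the matching input):

```python
# Hedged illustration of the two-stage pipeline. CardinalFst wraps its fields
# in `cardinal { ... }` via add_tokens(), and ClassifyFst wraps every token in
# `tokens { ... }`; e.g. the expected tagger output for "-25":
tagged = 'tokens { cardinal { negative: "true" integer: "hai mươi lăm" } }'
# VerbalizeFinalFst then strips the structure, yielding: "âm hai mươi lăm"
```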
diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py
new file mode 100644
index 000000000..5ffd7732e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
+from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for classifying whitelist for Vietnamese, e.g.
+        "h" -> tokens { name: "giờ" }
+        "p" -> tokens { name: "phút" }
+        "s" -> tokens { name: "giây" }
+    This class has the highest priority among all classifier grammars.
+    Whitelisted tokens are defined in and loaded from "data/whitelist.tsv".
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+        input_file: path to a file with whitelist replacements
+    """
+
+    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
+        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
+
+        def _get_whitelist_graph(input_case, file):
+            whitelist = load_labels(file)
+            if input_case == "lower_cased":
+                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
+            graph = pynini.string_map(whitelist)
+            return graph
+
+        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))
+        if not deterministic and input_case != "lower_cased":
+            graph |= pynutil.add_weight(
+                _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv")), weight=0.0001
+            )
+
+        if input_file:
+            whitelist_provided = _get_whitelist_graph(input_case, input_file)
+            if not deterministic:
+                graph |= whitelist_provided
+            else:
+                graph = whitelist_provided
+
+        # Add time units for better time handling (skipped while the optional
+        # data/time/time_units.tsv has not been added yet)
+        if not deterministic:
+            time_units_path = get_abs_path("data/time/time_units.tsv")
+            if os.path.exists(time_units_path):
+                graph |= _get_whitelist_graph(input_case, file=time_units_path)
+
+        self.graph = graph
+        self.final_graph = convert_space(self.graph).optimize()
+        self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
+
+        # Add tokens wrapper
+        self.fst = self.add_tokens(self.fst)
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py
new file mode 100644
index 000000000..c3e6d408e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/word.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for classifying Vietnamese words.
+        e.g. ngày -> name: "ngày"
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="word", kind="classify", deterministic=deterministic)
+        word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
+        self.fst = word.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/utils.py b/nemo_text_processing/text_normalization/vi/utils.py
new file mode 100644
index 000000000..332330921
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/utils.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
+
+
+def load_labels(abs_path):
+    """
+    Loads a label TSV file as a list of mappings
+
+    Args:
+        abs_path: absolute path to the file
+
+    Returns a list of mappings
+    """
+    with open(abs_path, encoding="utf-8") as label_tsv:
+        labels = list(csv.reader(label_tsv, delimiter="\t"))
+    return labels
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
new file mode 100644
index 000000000..5ca695673
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing Vietnamese cardinal numbers, e.g.
+        cardinal { negative: "true" integer: "hai mươi ba" } -> âm hai mươi ba
+        cardinal { integer: "một trăm" } -> một trăm
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
+
+        # Handle negative sign - Vietnamese uses "âm" for negative numbers
+        self.optional_sign = pynini.cross("negative: \"true\"", "âm ")
+        if not deterministic:
+            # Alternative way to say negative in Vietnamese
+            self.optional_sign |= pynini.cross("negative: \"true\"", "trừ ")
+
+        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
+
+        # Handle the integer part
+        integer = pynini.closure(NEMO_NOT_QUOTE)
+
+        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
+        integer = pynutil.delete("integer:") + self.integer
+
+        # Combine negative sign with integer
+        self.numbers = self.optional_sign + integer
+
+        # Delete the token structure and create final FST
+        delete_tokens = self.delete_tokens(self.numbers)
+        self.fst = delete_tokens.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
new file mode 100644
index 000000000..772b2b5f5
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
+from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst
+
+
+class VerbalizeFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+
+        # Initialize verbalizers
+        cardinal = CardinalFst(deterministic=deterministic)
+        cardinal_graph = cardinal.fst
+
+        whitelist = WhiteListFst(deterministic=deterministic)
+        whitelist_graph = whitelist.fst
+
+        word = WordFst(deterministic=deterministic)
+        word_graph = word.fst
+
+        # Combine all verbalizers
+        graph = cardinal_graph | whitelist_graph | word_graph
+
+        self.fst = graph
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..a049a5796
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
+from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.utils.logging import logger
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire Vietnamese sentence, e.g.
+        tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" }
+        -> xin một trăm chào
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
+        else:
+            verbalize = VerbalizeFst(deterministic=deterministic).fst
+            word = WordFst(deterministic=deterministic).fst
+
+            types = verbalize | word
+            graph = (
+                pynutil.delete("tokens")
+                + delete_space
+                + pynutil.delete("{")
+                + delete_space
+                + types
+                + delete_space
+                + pynutil.delete("}")
+            )
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph.optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
new file mode 100644
index 000000000..0b77ee498
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for verbalizing whitelist for Vietnamese
+        e.g. tokens { name: "giờ" } -> giờ
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)
+        graph = (
+            pynutil.delete("name:")
+            + delete_space
+            + pynutil.delete("\"")
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+        )
+        graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
new file mode 100644
index 000000000..3ad9a1a82
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for verbalizing Vietnamese words.
+        e.g. tokens { name: "ngày" } -> ngày
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="word", kind="verbalize", deterministic=deterministic)
+        chars = pynini.closure(NEMO_CHAR - " ", 1)
+        char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")
+        graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
+
+        self.fst = graph.optimize()
\ No newline at end of file
diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..aad7ae8c1
--- /dev/null
+++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,107 @@
+1~một
+2~hai
+3~ba
+4~bốn
+5~năm
+6~sáu
+7~bảy
+8~tám
+9~chín
+10~mười
+11~mười một
+12~mười hai
+15~mười lăm
+18~mười tám
+19~mười chín
+20~hai mươi
+21~hai mươi mốt
+25~hai mươi lăm
+30~ba mươi
+34~ba mươi tư
+44~bốn mươi tư
+55~năm mươi lăm
+67~sáu mươi bảy
+70~bảy mươi
+80~tám mươi
+95~chín mươi lăm
+100~một trăm
+101~một trăm linh một
+102~một trăm linh hai
+104~một trăm linh bốn
+105~một trăm linh năm
+110~một trăm mười
+111~một trăm mười một
+120~một trăm hai mươi
+123~một trăm hai mươi ba
+200~hai trăm
+201~hai trăm linh một
+500~năm trăm
+999~chín trăm chín mươi chín
+1000~một nghìn
+1001~một nghìn linh một
+1020~một nghìn hai mươi
+1095~một nghìn chín mươi lăm
+1100~một nghìn một trăm
+2000~hai nghìn
+10000~mười nghìn
+100000~một trăm nghìn
+1000000~một triệu
+2000000~hai triệu
+1000000000~một tỷ
+-1~âm một
+-25~âm hai mươi lăm
+-100~âm một trăm
+-1000~âm một nghìn
+0~không
+1000~một nghìn
+1001~một nghìn linh một
+101~một trăm linh một
+104~một trăm linh bốn
+105~một trăm linh năm
+24~hai mươi tư
+35~ba mươi lăm
+41~bốn mươi mốt
+55~năm mươi lăm
+91~chín mươi mốt
+14~mười bốn
+16~mười sáu
+17~mười bảy
+37~ba mươi bảy
+47~bốn mươi bảy
+57~năm mươi bảy
+63~sáu mươi ba
+79~bảy mươi chín
+84~tám mươi tư
+98~chín mươi tám
+-123~âm một trăm hai mươi ba
+-1001~âm một nghìn linh một
+-104~âm một trăm linh bốn
+1000001~một triệu linh một
+1001001~một triệu một nghìn linh một
+1050003~một triệu năm mươi nghìn linh ba
+1000000001~một tỷ linh một
+1001001101~một tỷ một triệu một nghìn một trăm linh một
+300~ba trăm
+400~bốn trăm
+500~năm trăm
+6000~sáu nghìn
+7000~bảy nghìn
+15000~mười lăm nghìn
+300000~ba trăm nghìn
+450000~bốn trăm năm mươi nghìn
+5000000~năm triệu
+700000000~bảy trăm triệu
+31~ba mươi mốt
+41~bốn mươi mốt
+51~năm mươi mốt
+61~sáu mươi mốt
+71~bảy mươi mốt
+81~tám mươi mốt
+91~chín mươi mốt
+5500000~năm triệu năm trăm nghìn
+1000010~một triệu mười
+1000100~một triệu một trăm
+1000101~một triệu một trăm linh một
+1010001~một triệu mười nghìn linh một
+10000000000~mười tỷ
+150~một trăm năm mươi
\ No newline at end of file
diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py
index 0a888f84b..b745b1e09 100644
--- a/tests/nemo_text_processing/vi/test_cardinal.py
+++ b/tests/nemo_text_processing/vi/test_cardinal.py
@@ -12,32 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# pytest tests/nemo_text_processing/vi/test_cardinal.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
-from ..utils import CACHE_DIR, parse_test_case_file
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
 
-try:
-    from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
-
-    PYNINI_AVAILABLE = True
-except (ImportError, ModuleNotFoundError):
-    PYNINI_AVAILABLE = False
+from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file
 
 
 class TestCardinal:
-    inverse_normalizer = (
-        InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
-    )
-
+    inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
     @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_cardinal.txt'))
-    @pytest.mark.skipif(
-        not PYNINI_AVAILABLE,
-        reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh",
-    )
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_denorm(self, test_input, expected):
         pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
         assert pred == expected
+
+    normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True)
+
+    normalizer_with_audio = (
+        NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
+        if CACHE_DIR and RUN_AUDIO_BASED_TESTS
+        else None
+    )
+
+    @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False)
+        assert pred == expected, f"input: {test_input}"
+
+        if self.normalizer_with_audio:
+            pred_non_deterministic = self.normalizer_with_audio.normalize(
+                test_input,
+                n_tagged=30,
+                punct_post_process=False,
+            )
+            assert expected in pred_non_deterministic, f"input: {test_input}"
\ No newline at end of file
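Both the pytest file above and the shUnit2 script below consume the same `written~spoken` test-case format; a minimal parser sketch, assuming `parse_test_case_file` in `tests/nemo_text_processing/utils.py` behaves similarly:

```python
# Sketch of the written~spoken convention; assumes one test case per line.
def parse_case(line):
    written, spoken = line.rstrip("\n").split("~", 1)
    return written, spoken

assert parse_case("101~một trăm linh một") == ("101", "một trăm linh một")
```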
diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
index 751351cd4..684eb3b22 100644
--- a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
+++ b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
@@ -1,7 +1,7 @@
 #! /bin/sh
 
 GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
-PROJECT_DIR=${2:-"/workspace/tests/en"}
+PROJECT_DIR=${2:-"/workspace/tests"}
 
 runtest () {
   input=$1
diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..d230b4642
--- /dev/null
+++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh
@@ -0,0 +1,77 @@
+
+#! /bin/sh
+
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+PROJECT_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    norm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1)
+
+    # trim white space
+    spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    norm_pred="$(echo -e "${norm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$written" "$spoken" "$norm_pred"
+  done < "$input"
+}
+
+testTNCardinal() {
+  input=$PROJECT_DIR/vi/data_text_normalization/test_cases_cardinal.txt
+  runtest $input
+}
+
+# testTNDate() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt
+#   runtest $input
+# }
+
+# testTNDecimal() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_decimal.txt
+#   runtest $input
+# }
+
+# testTNOrdinal() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_ordinal.txt
+#   runtest $input
+# }
+
+# testTNFraction() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_fraction.txt
+#   runtest $input
+# }
+
+# testTNTime() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_time.txt
+#   runtest $input
+# }
+
+# testTNMeasure() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt
+#   runtest $input
+# }
+
+# testTNMoney() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_money.txt
+#   runtest $input
+# }
+
+# testTNTelephone() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_telephone.txt
+#   runtest $input
+# }
+
+# testTNElectronic() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_electronic.txt
+#   runtest $input
+# }
+
+# Load shUnit2
+. /workspace/shunit2/shunit2
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 6b82dfbec..bc19f428d 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -137,7 +137,7 @@ def parse_args():
 
 if __name__ == '__main__':
     args = parse_args()
-    if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
+    if args.language in ['pt', 'ru', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
         raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.')
     TNPostProcessingFst = None
     ITNPostProcessingFst = None
@@ -240,6 +240,10 @@ def parse_args():
         from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize import (
             VerbalizeFst as ITNVerbalizeFst,
         )
+        from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import (
+            ClassifyFst as TNClassifyFst,
+        )
+        from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
     elif args.language == 'zh':
         from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,

From 2df93bc76f2400496ed58e792fcbcfa6cac1e633 Mon Sep 17 00:00:00 2001
From: folivoramanh
Date: Thu, 12 Jun 2025 12:42:36 +0700
Subject: [PATCH 2/4] Add missing init file

Signed-off-by: folivoramanh
---
 .../text_normalization/vi/data/__init__.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 nemo_text_processing/text_normalization/vi/data/__init__.py

diff --git a/nemo_text_processing/text_normalization/vi/data/__init__.py b/nemo_text_processing/text_normalization/vi/data/__init__.py
new file mode 100644
index 000000000..6ebc808fa
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
From 109d071d326e97bb47233d67b73762ce7744e87c Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Thu, 19 Jun 2025 00:00:34 +0700 Subject: [PATCH 3/4] Fix Cardinal and optimize logic Signed-off-by: folivoramanh --- Jenkinsfile | 7 +- .../vi/data/numbers/__init__.py | 13 + .../numbers/{units.tsv => magnitudes.tsv} | 0 .../text_normalization/vi/taggers/cardinal.py | 339 +++++------------- .../vi/taggers/punctuation.py | 25 +- .../vi/taggers/tokenize_and_classify.py | 1 - 6 files changed, 119 insertions(+), 266 deletions(-) rename nemo_text_processing/text_normalization/vi/data/numbers/{units.tsv => magnitudes.tsv} (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..2f9ca394d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -176,7 +176,7 @@ pipeline { } } - stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') { + stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') { when { anyOf { branch 'main' @@ -200,6 +200,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' } } + stage('L0: VI TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' + } + } stage('L0: HU TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py index e69de29bb..6ebc808fa 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
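The new VI TN Jenkins stage above is a CLI smoke test; the equivalent check from Python is sketched below. This assumes the vi grammars build locally; cache_dir is optional but speeds up repeated runs.

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Build (or restore from cache) the Vietnamese TN grammars, then normalize one string.
    normalizer = Normalizer(input_case='cased', lang='vi')
    print(normalizer.normalize("100", verbose=False))  # expected: "một trăm"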
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv similarity index 100% rename from nemo_text_processing/text_normalization/vi/data/numbers/units.tsv rename to nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index fdd3cae82..99fa76acd 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -19,264 +19,119 @@ from nemo_text_processing.text_normalization.vi.utils import get_abs_path -def load_data_map(filename): - """Load TSV data as pynini string map.""" - mappings = [] - with open(get_abs_path(f"data/numbers/{filename}"), 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - parts = line.split('\t') - if len(parts) >= 2: - mappings.append((parts[0], parts[1])) - return pynini.string_map(mappings) - - class CardinalFst(GraphFst): - """ - Simplified Vietnamese cardinal FST using recursive pattern building. - Reduced from 700+ lines to ~200 lines while maintaining full functionality. - """ - def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) - - # Load all basic data maps - zero = load_data_map("zero.tsv") - digit = load_data_map("digit.tsv") - teen = load_data_map("teen.tsv") - ties = load_data_map("ties.tsv") - # Load units as dict for easy access - units = {} - with open(get_abs_path("data/numbers/units.tsv"), 'r', encoding='utf-8') as f: - for line in f: - parts = line.strip().split('\t') - if len(parts) == 2: - units[parts[0]] = parts[1] - - # Load special digits (contextual variants) - special = {} - with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - for line in f: - parts = line.strip().split('\t') - if len(parts) >= 3: - special[parts[0]] = {'std': parts[1], 'alt': parts[2]} - - # Build core patterns - single_digit = digit + resources = { + 'zero': pynini.string_file(get_abs_path("data/numbers/zero.tsv")), + 'digit': pynini.string_file(get_abs_path("data/numbers/digit.tsv")), + 'teen': pynini.string_file(get_abs_path("data/numbers/teen.tsv")), + 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + } + self.zero, self.digit, self.teen, self.ties = resources.values() - # Special digits for specific contexts (X1, X4, X5 → mốt, tư, lăm) - special_1 = pynini.cross("1", special["1"]["alt"]) # mốt - special_4 = pynini.cross("4", special["4"]["alt"]) # tư - special_5 = pynini.cross("5", special["5"]["alt"]) # lăm + with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: + self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} - # Linh digits (for 0X patterns) - use standard forms - linh_digits = pynini.union( - pynini.cross("1", special["1"]["std"]), # một - pynini.cross("4", special["4"]["std"]), # bốn - pynini.cross("5", special["5"]["std"]), # năm - digit - ) + with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: + special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for line in f + if len(parts := line.strip().split('\t')) >= 3} - # Two digit patterns - two_digit = pynini.union( - teen, # 10-19 - ties + pynutil.delete("0"), # 20, 30, etc. 
- ties + insert_space + pynini.union( - special_1, special_4, special_5, # X1, X4, X5 cases - pynini.union("2", "3", "6", "7", "8", "9") @ digit # other digits - ) - ) - - # Build hundreds (3 digits: 100-999) - hundreds_base = pynini.union( - single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("00"), - single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("0") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["hundred"]) + insert_space + two_digit - ) - hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ hundreds_base - - # Build thousands (4-6 digits) with explicit digit constraints - # 4-digit thousands (1000-9999) - thousands_4d = pynini.union( - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) + self.special_digits = pynini.union(*[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]]) + self.linh_digits = pynini.union(*[pynini.cross(k, special[k]["std"]) for k in ["1", "4", "5"]], self.digit) - # 5-digit thousands (10000-99999) - thousands_5d = pynini.union( - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - two_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) + self.single_digit = self.digit - # 6-digit thousands (100000-999999) - thousands_6d = pynini.union( - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) - - thousands = pynini.union( - pynini.closure(NEMO_DIGIT, 6, 6) @ thousands_6d, - pynini.closure(NEMO_DIGIT, 5, 5) @ thousands_5d, - pynini.closure(NEMO_DIGIT, 4, 4) @ thousands_4d - ) - - # Build millions (7-9 digits) with explicit patterns to fix precedence - # 7-digit millions (1000000-9999999) - millions_7d = pynini.union( - # Exact millions: 1000000, 2000000, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - # Millions with linh: 1000001, 1000002, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Millions with tens: 1000010, 1000020, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - # Millions with hundreds: 1000100, 1000200, etc. 
- single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - # Millions with thousands: 5500000 -> năm triệu năm trăm nghìn - single_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d, - # Complex millions: X001YYY -> X triệu một nghìn YYY (critical fix for 1001001) - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Complex millions: X0YZWWW -> X triệu YZ nghìn WWW (critical fix for 1050003) - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0") - + insert_space + two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Full millions: X123YZW -> X triệu YZW nghìn/trăm/etc (1050003) - single_digit + insert_space + pynutil.insert(units["million"]) + insert_space - + pynini.closure(NEMO_DIGIT, 3, 3) @ ( - pynini.union( - # YZW000 patterns - invalid for 6 digits, skip - # YZ0ABC patterns - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # YZ0ABC patterns with tens - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - # YYZABC patterns with hundreds - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # 0YYZABC patterns (hundreds only) - pynutil.delete("0") + hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # 00YABC patterns (tens only) - pynutil.delete("00") + hundreds_base, - # Y00ABC patterns (single thousand) - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # YZ00AB patterns (tens of thousands) - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000") - ) + self.two_digit = pynini.union( + self.teen, + self.ties + pynutil.delete("0"), + self.ties + insert_space + pynini.union( + self.special_digits, + pynini.union("2", "3", "6", "7", "8", "9") @ self.digit ) ) - # 8-digit millions (10000000-99999999) - millions_8d = pynini.union( - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d + self.hundreds_pattern = pynini.union( + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("00"), + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + 
pynutil.delete("0") + + insert_space + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + insert_space + self.two_digit ) - # 9-digit millions (100000000-999999999) - millions_9d = pynini.union( - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d - ) - - millions = pynini.union( - pynini.closure(NEMO_DIGIT, 9, 9) @ millions_9d, - pynini.closure(NEMO_DIGIT, 8, 8) @ millions_8d, - pynini.closure(NEMO_DIGIT, 7, 7) @ millions_7d - ) - - # Build billions (10-12 digits) with explicit patterns - # 10-digit billions (1000000000-9999999999) - billions_10d = pynini.union( - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00000000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("0000000") - + insert_space + two_digit, - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000") - + insert_space + hundreds_base, - # Complex billions: 1001001101 -> một tỷ một triệu một nghìn một trăm linh một - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # Full billions with millions - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) + self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - # 11-digit billions (10000000000-99999999999) - billions_11d = pynini.union( - two_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) + self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) + self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) + self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) - # 12-digit billions (100000000000-999999999999) - billions_12d = pynini.union( - hundreds_base + insert_space + 
pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) - - billions = pynini.union( - pynini.closure(NEMO_DIGIT, 12, 12) @ billions_12d, - pynini.closure(NEMO_DIGIT, 11, 11) @ billions_11d, - pynini.closure(NEMO_DIGIT, 10, 10) @ billions_10d - ) - - # Combine all patterns with proper precedence (longest first) self.graph = pynini.union( - billions, # 10-12 digits - millions, # 7-9 digits - thousands, # 4-6 digits - hundreds, # 3 digits - two_digit, # 2 digits - single_digit, # 1 digit - zero # 0 + self.billion, self.million, self.thousand, self.hundreds, + self.two_digit, self.single_digit, self.zero ).optimize() - - # For decimal usage - self.single_digits_graph = single_digit | zero + + self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - - # Build final FST with negative handling - optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) - final_graph = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") - self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file + + self.fst = self.add_tokens( + pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + ).optimize() + + def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): + magnitude_word = self.magnitudes[name] + + patterns = [] + for digits in range(min_digits, max_digits + 1): + leading_digits = digits - zero_count + leading_fst = {1: self.single_digit, 2: self.two_digit, 3: self.hundreds_pattern}.get( + leading_digits, self.hundreds_pattern) + + prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) + + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] + + if prev_pattern: + digit_patterns.append(prefix + insert_space + prev_pattern) + + trailing_patterns = [] + for trailing_zeros in range(zero_count): + remaining_digits = zero_count - trailing_zeros + if remaining_digits == 1: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + ) + elif remaining_digits == 2: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.two_digit + ) + elif remaining_digits == 3: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.hundreds_pattern + ) + digit_patterns.extend(trailing_patterns) + + if name == "million" and digits == 7: + digit_patterns.extend([ + prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, + + prefix + pynutil.delete("0") + insert_space + self.two_digit + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + ]) + elif name == "billion" and digits == 10: + digit_patterns.append( + prefix + pynutil.delete("00") + insert_space + 
self.single_digit + insert_space + + pynutil.insert(self.magnitudes["million"]) + pynutil.delete("00") + insert_space + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["thousand"]) + + insert_space + self.hundreds_pattern + ) + + patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) + + return pynini.union(*patterns) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index f0d20918e..044c6494c 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst class PunctuationFst(GraphFst): @@ -28,27 +28,8 @@ def __init__(self, deterministic: bool = True): # Common punctuation marks # Use escape() for brackets since they are special regex chars - punct = ( - pynini.accep(".") - | pynini.accep(",") - | pynini.accep("?") - | pynini.accep("!") - | pynini.accep(":") - | pynini.accep(";") - | pynini.accep("-") - | pynini.accep("–") - | pynini.accep("—") - | pynini.accep("(") - | pynini.accep(")") - | pynini.accep(pynini.escape("[")) - | pynini.accep(pynini.escape("]")) - | pynini.accep(pynini.escape("{")) - | pynini.accep(pynini.escape("}")) - | pynini.accep('"') - | pynini.accep("'") - | pynini.accep("...") - | pynini.accep("…") - ) + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~–—――…»«„“›‹‚‘’⟨⟩" + punct = pynini.union(*s) # Create the punctuation transduction graph = pynutil.insert('name: "') + punct + pynutil.insert('"') diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 52038b14d..004e51c9d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_WHITE_SPACE, GraphFst, delete_extra_space, delete_space, From d751b36114ef08c9ef6a30b23a81e2cf54b58d30 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:13:25 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/__init__.py | 2 +- .../text_normalization/vi/taggers/__init__.py | 2 +- .../text_normalization/vi/taggers/cardinal.py | 152 +++++++++++------- .../vi/taggers/punctuation.py | 2 +- .../vi/taggers/tokenize_and_classify.py | 13 +- .../vi/taggers/whitelist.py | 4 +- .../text_normalization/vi/taggers/word.py | 2 +- .../vi/verbalizers/__init__.py | 2 +- .../vi/verbalizers/cardinal.py | 2 +- .../vi/verbalizers/verbalize.py | 4 +- .../vi/verbalizers/verbalize_final.py | 4 +- .../vi/verbalizers/whitelist.py | 2 +- .../text_normalization/vi/verbalizers/word.py | 2 +- .../nemo_text_processing/vi/test_cardinal.py | 12 +- 14 files changed, 127 insertions(+), 78 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py index 602b8a347..bc443be41 100644 --- 
a/nemo_text_processing/text_normalization/vi/__init__.py +++ b/nemo_text_processing/text_normalization/vi/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py index 602b8a347..bc443be41 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 99fa76acd..fa0f04fad 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -22,77 +22,91 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) - + resources = { 'zero': pynini.string_file(get_abs_path("data/numbers/zero.tsv")), 'digit': pynini.string_file(get_abs_path("data/numbers/digit.tsv")), 'teen': pynini.string_file(get_abs_path("data/numbers/teen.tsv")), - 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")), } self.zero, self.digit, self.teen, self.ties = resources.values() - + with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} - + with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for line in f - if len(parts := line.strip().split('\t')) >= 3} - - self.special_digits = pynini.union(*[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]]) + special = { + parts[0]: {'std': parts[1], 'alt': parts[2]} + for line in f + if len(parts := line.strip().split('\t')) >= 3 + } + + self.special_digits = pynini.union( + *[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]] + ) self.linh_digits = pynini.union(*[pynini.cross(k, special[k]["std"]) for k in ["1", "4", "5"]], self.digit) - + self.single_digit = self.digit - + self.two_digit = pynini.union( self.teen, self.ties + pynutil.delete("0"), - self.ties + insert_space + pynini.union( - self.special_digits, - pynini.union("2", "3", "6", "7", "8", "9") @ self.digit - ) + self.ties + + insert_space + + pynini.union(self.special_digits, pynini.union("2", "3", "6", "7", "8", "9") @ self.digit), ) - + self.hundreds_pattern = pynini.union( self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("00"), - - self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("0") - + insert_space + 
pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, - - self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + insert_space + self.two_digit + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["hundred"]) + + pynutil.delete("0") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["hundred"]) + + insert_space + + self.two_digit, ) - + self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - + self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) - + self.graph = pynini.union( - self.billion, self.million, self.thousand, self.hundreds, - self.two_digit, self.single_digit, self.zero + self.billion, self.million, self.thousand, self.hundreds, self.two_digit, self.single_digit, self.zero ).optimize() - + self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - + self.fst = self.add_tokens( - pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + - pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + + pynutil.insert("integer: \"") + + self.graph + + pynutil.insert("\"") ).optimize() - + def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): magnitude_word = self.magnitudes[name] - + patterns = [] for digits in range(min_digits, max_digits + 1): leading_digits = digits - zero_count leading_fst = {1: self.single_digit, 2: self.two_digit, 3: self.hundreds_pattern}.get( - leading_digits, self.hundreds_pattern) - + leading_digits, self.hundreds_pattern + ) + prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) - + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] - + if prev_pattern: digit_patterns.append(prefix + insert_space + prev_pattern) @@ -101,8 +115,12 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre remaining_digits = zero_count - trailing_zeros if remaining_digits == 1: trailing_patterns.append( - prefix + pynutil.delete("0" * trailing_zeros) + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + prefix + + pynutil.delete("0" * trailing_zeros) + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits ) elif remaining_digits == 2: trailing_patterns.append( @@ -113,25 +131,51 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.hundreds_pattern ) digit_patterns.extend(trailing_patterns) - + if name == "million" and digits == 7: - digit_patterns.extend([ - prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + - pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, - - prefix + pynutil.delete("0") + insert_space + self.two_digit + insert_space + - pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits - ]) + digit_patterns.extend( + [ + prefix 
+ + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + pynutil.delete("00") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + prefix + + pynutil.delete("0") + + insert_space + + self.two_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + pynutil.delete("00") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + ] + ) elif name == "billion" and digits == 10: digit_patterns.append( - prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + - pynutil.insert(self.magnitudes["million"]) + pynutil.delete("00") + insert_space + - self.single_digit + insert_space + pynutil.insert(self.magnitudes["thousand"]) + - insert_space + self.hundreds_pattern + prefix + + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["million"]) + + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + insert_space + + self.hundreds_pattern ) - + patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) - - return pynini.union(*patterns) \ No newline at end of file + + return pynini.union(*patterns) diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index 044c6494c..1e08cb02d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -35,4 +35,4 @@ def __init__(self, deterministic: bool = True): graph = pynutil.insert('name: "') + punct + pynutil.insert('"') final_graph = pynutil.insert("punctuation { ") + graph + pynutil.insert(" }") - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 004e51c9d..7c46c786a 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -45,7 +45,10 @@ def __init__( far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far",) + far_file = os.path.join( + cache_dir, + f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far", + ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] logger.info(f"ClassifyFst.fst was restored from {far_file}.") @@ -72,9 +75,9 @@ def __init__( logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes") classify = ( - pynutil.add_weight(whitelist_graph, 0.8) - | pynutil.add_weight(cardinal_graph, 0.9) - | pynutil.add_weight(word_graph, 100) + pynutil.add_weight(whitelist_graph, 0.8) + | pynutil.add_weight(cardinal_graph, 0.9) + | pynutil.add_weight(word_graph, 100) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") @@ -88,4 +91,4 @@ def __init__( self.fst = graph.optimize() 
if far_file: - generator_main(far_file, {"tokenize_and_classify": self.fst}) \ No newline at end of file + generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py index 5ffd7732e..aed5e356a 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py @@ -15,8 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class WhiteListFst(GraphFst): @@ -67,4 +67,4 @@ def _get_whitelist_graph(input_case, file): self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize() # Add tokens wrapper - self.fst = self.add_tokens(self.fst) \ No newline at end of file + self.fst = self.add_tokens(self.fst) diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py index c3e6d408e..f0be213c7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/text_normalization/vi/taggers/word.py @@ -31,4 +31,4 @@ class WordFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="classify", deterministic=deterministic) word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") - self.fst = word.optimize() \ No newline at end of file + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py index 602b8a347..bc443be41 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
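A note on the classify weights (whitelist 0.8, cardinal 0.9, word 100, punct 2.1) in the tokenize_and_classify.py hunk above: shortest-path composition keeps the cheapest path, so the heavily weighted word fallback only fires when no specific tagger accepts the input. A toy pynini illustration of that arbitration (not the project grammar):

    import pynini
    from pynini.lib import pynutil

    # The cardinal path (weight 0.9) beats the generic fallback (weight 100) on "100".
    cardinal = pynutil.add_weight(pynini.cross("100", "một trăm"), 0.9)
    fallback = pynutil.add_weight(pynini.closure(pynini.union("1", "0", "x"), 1), 100.0)
    classify = (cardinal | fallback).optimize()

    print(pynini.shortestpath(pynini.compose("100", classify)).string())  # -> một trăm
    print(pynini.shortestpath(pynini.compose("x", classify)).string())    # -> x (fallback)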
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py index 5ca695673..530c3dfce 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py @@ -52,4 +52,4 @@ def __init__(self, deterministic: bool = True): # Delete the token structure and create final FST delete_tokens = self.delete_tokens(self.numbers) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index 772b2b5f5..fff63933e 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -25,7 +25,7 @@ def __init__(self, deterministic: bool = True): # Initialize verbalizers cardinal = CardinalFst(deterministic=deterministic) cardinal_graph = cardinal.fst - + whitelist = WhiteListFst(deterministic=deterministic) whitelist_graph = whitelist.fst @@ -35,4 +35,4 @@ def __init__(self, deterministic: bool = True): # Combine all verbalizers graph = cardinal_graph | whitelist_graph | word_graph - self.fst = graph \ No newline at end of file + self.fst = graph diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py index a049a5796..cd9ec39eb 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire Vietnamese sentence, e.g. 
- tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" } + tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" } -> xin một trăm chào Args: @@ -69,4 +69,4 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = graph.optimize() if far_file: - generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py index 0b77ee498..6e0699827 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py @@ -39,4 +39,4 @@ def __init__(self, deterministic: bool = True): ) graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py index 3ad9a1a82..f9547acba 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py @@ -34,4 +34,4 @@ def __init__(self, deterministic: bool = True): char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) - self.fst = graph.optimize() \ No newline at end of file + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py index b745b1e09..636932aed 100644 --- a/tests/nemo_text_processing/vi/test_cardinal.py +++ b/tests/nemo_text_processing/vi/test_cardinal.py @@ -25,8 +25,8 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) - + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -34,8 +34,10 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) - + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if CACHE_DIR and RUN_AUDIO_BASED_TESTS @@ -55,4 +57,4 @@ def test_norm(self, test_input, expected): n_tagged=30, punct_post_process=False, ) - assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file + assert expected in pred_non_deterministic, f"input: {test_input}"
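For completeness, the non-deterministic branch at the end of test_cardinal.py can be exercised directly as sketched below; this assumes the vi grammars are available (the first build is slow without a cache_dir).

    from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

    # Returns up to n_tagged candidate verbalizations rather than a single output.
    nwa = NormalizerWithAudio(input_case='cased', lang='vi')
    candidates = nwa.normalize("21", n_tagged=30, punct_post_process=False)
    print(candidates)  # "hai mươi mốt" should appear among the candidates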