diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py index bc443be41..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/__init__.py +++ b/nemo_text_processing/text_normalization/vi/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/__init__.py b/nemo_text_processing/text_normalization/vi/data/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/date/__init__.py b/nemo_text_processing/text_normalization/vi/data/date/__init__.py new file mode 100644 index 000000000..b2de1dca7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/date/days.tsv b/nemo_text_processing/text_normalization/vi/data/date/days.tsv new file mode 100644 index 000000000..5b70479a6 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/days.tsv @@ -0,0 +1,40 @@ +01 một +02 hai +03 ba +04 bốn +05 năm +06 sáu +07 bảy +08 tám +09 chín +1 một +2 hai +3 ba +4 bốn +5 năm +6 sáu +7 bảy +8 tám +9 chín +10 mười +11 mười một +12 mười hai +13 mười ba +14 mười bốn +15 mười lăm +16 mười sáu +17 mười bảy +18 mười tám +19 mười chín +20 hai mươi +21 hai mươi mốt +22 hai mươi hai +23 hai mươi ba +24 hai mươi bốn +25 hai mươi lăm +26 hai mươi sáu +27 hai mươi bảy +28 hai mươi tám +29 hai mươi chín +30 ba mươi +31 ba mươi mốt \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/date/months.tsv b/nemo_text_processing/text_normalization/vi/data/date/months.tsv new file mode 100644 index 000000000..fb836fba1 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/months.tsv @@ -0,0 +1,21 @@ +1 một +2 hai +3 ba +4 tư +5 năm +6 sáu +7 bảy +8 tám +9 chín +10 mười +11 mười một +12 mười hai +01 một +02 hai +03 ba +04 tư +05 năm +06 sáu +07 bảy +08 tám +09 chín \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv new file mode 100644 index 000000000..31b49f955 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv @@ -0,0 +1,4 @@ +tcn trước công nguyên +scn sau công nguyên +TCN trước công nguyên +SCN sau công nguyên \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py b/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/roman/__init__.py b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py new file mode 100644 index 000000000..b2de1dca7 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv b/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv new file mode 100644 index 000000000..e5f3d75a9 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv @@ -0,0 +1,12 @@ +thế kỉ +thế kỷ +thứ +chương +phần +mục +đoạn +năm +khoản +phụ lục +khóa +số \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv b/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv new file mode 100644 index 000000000..d4d8ad20b --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv @@ -0,0 +1,13 @@ +I 1 +V 5 +X 10 +L 50 +C 100 +D 500 +M 1000 +IV 4 +IX 9 +XL 40 +XC 90 +CD 400 +CM 900 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py new file mode 100644 index 000000000..fae4ba088 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -0,0 +1,148 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py index bc443be41..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index fa0f04fad..58c59b530 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.vi.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py new file mode 100644 index 000000000..36a1d1ae4 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -0,0 +1,134 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying Vietnamese dates, e.g. + 15/01/2024 -> date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + tháng 4 2024 -> date { month: "tư" year: "hai nghìn hai mươi tư" } + ngày 15/01/2024 -> date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + ngày 12 tháng 5 năm 2025 -> date { day: "mười hai" month: "năm" year: "hai nghìn hai mươi lăm" } + năm 20 SCN -> date { year: "hai mươi" era: "sau công nguyên" } + """ + + def __init__(self, cardinal, deterministic: bool = True): + super().__init__(name="date", kind="classify", deterministic=deterministic) + + day_mappings = load_labels(get_abs_path("data/date/days.tsv")) + month_mappings = load_labels(get_abs_path("data/date/months.tsv")) + era_mappings = load_labels(get_abs_path("data/date/year_suffix.tsv")) + + day_digit = pynini.closure(NEMO_DIGIT, 1, 2) + month_digit = pynini.closure(NEMO_DIGIT, 1, 2) + year_digit = pynini.closure(NEMO_DIGIT, 1, 4) + separator = pynini.union("/", "-", ".") + + day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) + month_convert = pynini.string_map([(k, v) for k, v in month_mappings]) + year_convert = pynini.compose(year_digit, cardinal.graph) + + era_to_full = {} + for abbr, full_form in era_mappings: + era_to_full[abbr.lower()] = full_form + era_to_full[abbr.upper()] = full_form + + era_convert = pynini.string_map([(k, v) for k, v in era_to_full.items()]) + + day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\" ") + month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ") + year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\"") + month_final = pynutil.insert("month: \"") + month_convert + pynutil.insert("\"") + era_part = pynutil.insert("era: \"") + era_convert + pynutil.insert("\"") + + patterns = [] + + date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part + patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep)) + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, + pynutil.delete("ngày ") + date_sep, + ) + ) + + for sep in [separator, pynini.accep(" ")]: + patterns.append( + pynini.compose( + pynini.accep("tháng ") + month_digit + sep + year_digit, + pynutil.delete("tháng ") + month_part + pynutil.delete(sep) + year_part, + ) + ) + + day_month_sep = day_part + pynutil.delete(separator) + month_final + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit, pynutil.delete("ngày ") + day_month_sep + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final, + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + + day_digit + + pynini.accep(" tháng ") + + month_digit + + pynini.accep(" năm ") + + year_digit, + pynutil.delete("ngày ") + + day_part + + pynutil.delete(" tháng ") + + month_part + + pynutil.delete(" năm ") + + year_part, + ) + ) + + patterns.append(pynini.compose(pynini.accep("năm ") + year_digit, pynutil.delete("năm ") + year_part)) + + era_abbrs = list(era_to_full.keys()) + for era_abbr in era_abbrs: + patterns.append( + pynini.compose( + pynini.accep("năm ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm ") + year_part + pynutil.delete(" ") + era_part, + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("năm thứ ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm thứ ") + + pynutil.insert("ordinal: \"") + + year_convert + + pynutil.insert("\" ") + + pynutil.delete(" ") + + era_part, + ) + ) + + self.fst = self.add_tokens(pynini.union(*patterns)) diff --git a/nemo_text_processing/text_normalization/vi/taggers/decimal.py b/nemo_text_processing/text_normalization/vi/taggers/decimal.py index 0b314317b..0e0d605d0 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index 807e96dab..ed3394120 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py index d896bcef3..acacf63f7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index 1e08cb02d..d4610b3ee 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst class PunctuationFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/taggers/roman.py b/nemo_text_processing/text_normalization/vi/taggers/roman.py new file mode 100644 index 000000000..1c68c7875 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/roman.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels + + +class RomanFst(GraphFst): + """ + Finite state transducer for classifying roman numbers in Vietnamese context: + e.g. "thế kỉ XV" -> tokens { roman { key_cardinal: "thế kỉ" integer: "mười lăm" } } + e.g. "thế kỷ IV" -> tokens { roman { key_cardinal: "thế kỷ" integer: "bốn" } } + e.g. "thứ IV" -> tokens { roman { key_cardinal: "thứ" integer: "bốn" } } + e.g. "chương III" -> tokens { roman { key_cardinal: "chương" integer: "ba" } } + e.g. "phần ix" -> tokens { roman { key_cardinal: "phần" integer: "chín" } } + + Args: + cardinal: CardinalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="roman", kind="classify", deterministic=deterministic) + + key_words = [] + key_word_path = get_abs_path("data/roman/key_word.tsv") + for k_word in load_labels(key_word_path): + key_words.append(k_word[0]) + + key_words_fst = pynini.union(*[pynini.accep(word) for word in key_words]).optimize() + + roman_numeral_path = get_abs_path("data/roman/roman_numerals.tsv") + roman_numeral_pairs = load_labels(roman_numeral_path) + + roman_to_arabic = {} + for roman, value in roman_numeral_pairs: + roman_to_arabic[roman] = value + roman_to_arabic[roman.lower()] = value + + self.arabic_to_roman = {} + for roman, value in roman_numeral_pairs: + self.arabic_to_roman[int(value)] = roman + + valid_roman_pairs = [] + for i in range(1, 4000): + roman_upper = self._int_to_roman(i) + roman_lower = roman_upper.lower() + valid_roman_pairs.append((roman_upper, str(i))) + valid_roman_pairs.append((roman_lower, str(i))) + + roman_to_arabic_fst = pynini.string_map(valid_roman_pairs).optimize() + + cardinal_graph = cardinal.graph + + graph = ( + pynutil.insert("key_cardinal: \"") + + key_words_fst + + pynutil.insert("\"") + + pynini.accep(" ") + + pynutil.insert("integer: \"") + + pynini.compose(roman_to_arabic_fst, cardinal_graph) + + pynutil.insert("\"") + ).optimize() + + self.fst = self.add_tokens(graph).optimize() + + def _int_to_roman(self, num): + values = sorted(self.arabic_to_roman.keys(), reverse=True) + + roman_num = '' + for value in values: + while num >= value: + roman_num += self.arabic_to_roman[value] + num -= value + + return roman_num diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 73feb7182..d18e04903 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,17 +18,19 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, delete_space, generator_main, ) from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.vi.taggers.roman import RomanFst from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.vi.taggers.word import WordFst from nemo_text_processing.utils.logging import logger @@ -92,15 +94,27 @@ def __init__( fraction_graph = fraction.fst logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") + start_time = time.time() + date = DateFst(cardinal=cardinal, deterministic=deterministic) + date_graph = date.fst + logger.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") + + start_time = time.time() + roman = RomanFst(cardinal=cardinal, deterministic=deterministic) + roman_graph = roman.fst + logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes") + classify = ( - pynutil.add_weight(whitelist_graph, 0.8) - | pynutil.add_weight(ordinal_graph, 0.81) - | pynutil.add_weight(decimal_graph, 0.85) - | pynutil.add_weight(cardinal_graph, 0.9) - | pynutil.add_weight(fraction_graph, 1.0) + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(roman_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py index aed5e356a..d2775f205 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py index f0be213c7..d101204f1 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/text_normalization/vi/taggers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_SPACE, GraphFst class WordFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/utils.py b/nemo_text_processing/text_normalization/vi/utils.py index 332330921..6b0871d9d 100644 --- a/nemo_text_processing/text_normalization/vi/utils.py +++ b/nemo_text_processing/text_normalization/vi/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py index bc443be41..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py index 530c3dfce..b096e759d 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py new file mode 100644 index 000000000..4e918e3d4 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing Vietnamese dates, e.g. + date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + -> ngày mười lăm tháng một năm hai nghìn hai mươi tư + + date { month: "tư" year: "hai nghìn hai mươi tư" } + -> tháng tư năm hai nghìn hai mươi tư + + date { year: "hai mươi" era: "sau công nguyên" } + -> năm hai mươi sau công nguyên + + date { ordinal: "mười" era: "trước công nguyên" } + -> năm thứ mười trước công nguyên + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + quoted_content = pynini.closure(NEMO_NOT_QUOTE) + + day_expr = pynutil.delete("day: \"") + quoted_content + pynutil.delete("\"") + day_with_prefix = pynutil.insert("ngày ") + day_expr + + month_expr = pynutil.delete("month: \"") + quoted_content + pynutil.delete("\"") + month_with_prefix = pynutil.insert("tháng ") + month_expr + + year_expr = pynutil.delete("year: \"") + quoted_content + pynutil.delete("\"") + year_with_prefix = pynutil.insert("năm ") + year_expr + + era_expr = pynutil.delete("era: \"") + quoted_content + pynutil.delete("\"") + + ordinal_expr = pynutil.delete("ordinal: \"") + quoted_content + pynutil.delete("\"") + ordinal_with_prefix = pynutil.insert("năm thứ ") + ordinal_expr + + date_graph = pynini.union( + day_with_prefix + + delete_space + + insert_space + + month_with_prefix + + delete_space + + insert_space + + year_with_prefix, + month_with_prefix + delete_space + insert_space + year_with_prefix, + day_with_prefix + delete_space + insert_space + month_with_prefix, + year_with_prefix, + year_with_prefix + delete_space + insert_space + era_expr, + ordinal_with_prefix + delete_space + insert_space + era_expr, + ) + + self.fst = self.delete_tokens(date_graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py index 8fe523b37..bcda3d757 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py index 77ace3454..328bbcded 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py index 7388f7df4..0a0bf3ac0 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py new file mode 100644 index 000000000..977f7e313 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space + + +class RomanFst(GraphFst): + """ + Finite state transducer for verbalizing Roman numerals in Vietnamese + e.g. tokens { roman { key_cardinal: "thế kỉ" integer: "mười lăm" } } -> thế kỉ mười lăm + e.g. tokens { roman { key_cardinal: "thế kỷ" integer: "bốn" } } -> thế kỷ bốn + e.g. tokens { roman { key_cardinal: "thứ" integer: "bốn" } } -> thứ bốn + e.g. tokens { roman { integer: "mười lăm" } } -> mười lăm + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="roman", kind="verbalize", deterministic=deterministic) + + key_cardinal = pynutil.delete("key_cardinal: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph_with_key = key_cardinal + delete_space + pynutil.insert(" ") + integer + graph_without_key = integer + graph = pynini.union(graph_with_key, graph_without_key) + delete_tokens = self.delete_tokens(graph) + + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index e3d34b968..3c62c9651 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst class VerbalizeFst(GraphFst): @@ -44,7 +46,22 @@ def __init__(self, deterministic: bool = True): fraction = FractionFst(deterministic=deterministic) fraction_graph = fraction.fst + date = DateFst(deterministic=deterministic) + date_graph = date.fst + + roman = RomanFst(deterministic=deterministic) + roman_graph = roman.fst + # Combine all verbalizers - graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph + graph = ( + cardinal_graph + | whitelist_graph + | word_graph + | ordinal_graph + | decimal_graph + | fraction_graph + | date_graph + | roman_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py index cd9ec39eb..aa8344459 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,14 +17,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, delete_space, generator_main, ) -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py index 6e0699827..7afda862e 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py index f9547acba..78aa1d7c1 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space class WordFst(GraphFst): diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..c95e00e97 --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt @@ -0,0 +1,13 @@ +ngày 15/01/2024~ngày mười lăm tháng một năm hai nghìn hai mươi tư +01/12/2023~ngày một tháng mười hai năm hai nghìn hai mươi ba +25-03-1975~ngày hai mươi lăm tháng ba năm một nghìn chín trăm bảy mươi lăm +10.05.2000~ngày mười tháng năm năm hai nghìn +tháng 1 2024~tháng một năm hai nghìn hai mươi tư +tháng 12 2023~tháng mười hai năm hai nghìn hai mươi ba +ngày 12 tháng 5 năm 2025~ngày mười hai tháng năm năm hai nghìn hai mươi lăm +tháng 5 năm nay~tháng năm năm nay +ngày 4 tháng này~ngày bốn tháng này +hôm nay là ngày 19/05/2025 sinh nhật Bác Hồ~hôm nay là ngày mười chín tháng năm năm hai nghìn hai mươi lăm sinh nhật Bác Hồ +ngày 14/4 hàng năm~ngày mười bốn tháng tư hàng năm +tháng 04/1969~tháng tư năm một nghìn chín trăm sáu mươi chín +ngày 12 tháng mười hai năm 2023~ngày mười hai tháng mười hai năm hai nghìn hai mươi ba \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt new file mode 100644 index 000000000..543ef052b --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt @@ -0,0 +1,59 @@ +thế kỉ XV~thế kỉ mười lăm +thế kỉ XX~thế kỉ hai mươi +thế kỉ XXI~thế kỉ hai mươi mốt +thế kỷ IV~thế kỷ bốn +thế kỷ V~thế kỷ năm +thứ I~thứ một +thứ V~thứ năm +thứ X~thứ mười +thứ XV~thứ mười lăm +chương III~chương ba +phần ix~phần chín +chương C~chương một trăm +mục XCIX~mục chín mươi chín +chương MMMCMXCIX~chương ba nghìn chín trăm chín mươi chín +thế kỉ xix~thế kỉ mười chín +thế kỷ vi~thế kỷ sáu +phần xl~phần bốn mươi +mục xc~mục chín mươi +mục cd~mục bốn trăm +mục cm~mục chín trăm +thứ viii~thứ tám +thứ ix~thứ chín +thứ xi~thứ mười một +chương lxxxviii~chương tám mươi tám +chương cccxlv~chương ba trăm bốn mươi lăm +thế kỉ XV và chương IX~thế kỉ mười lăm và chương chín +trong phần X có mục IV~trong phần mười có mục bốn +chương I~chương một +chương MMMCMXCIX~chương ba nghìn chín trăm chín mươi chín +CPU I9 là dòng cao cấp~CPU I9 là dòng cao cấp +Phiên bản V2.0 đã lỗi thời~Phiên bản V2.0 đã lỗi thời +đoạn II~đoạn hai +đoạn iv~đoạn bốn +đoạn VII~đoạn bảy +đoạn xii~đoạn mười hai +năm MCMXCIX~năm một nghìn chín trăm chín mươi chín +năm mmxx~năm hai nghìn hai mươi +khoản III~khoản ba +khoản vi~khoản sáu +khoản XIV~khoản mười bốn +khoản xxv~khoản hai mươi lăm +phụ lục I~phụ lục một +phụ lục v~phụ lục năm +phụ lục XII~phụ lục mười hai +phụ lục xx~phụ lục hai mươi +khóa VII~khóa bảy +khóa xi~khóa mười một +khóa XV~khóa mười lăm +khóa xxx~khóa ba mươi +số I~số một +số v~số năm +số X~số mười +số l~số năm mươi +đoạn IX mục III~đoạn chín mục ba +khoản II phụ lục IV~khoản hai phụ lục bốn +khóa XII số IX~khóa mười hai số chín +năm MMXXIII khoản V~năm hai nghìn hai mươi ba khoản năm +chương VII đoạn XI~chương bảy đoạn mười một +phần XX mục XV~phần hai mươi mục mười lăm \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py index 636932aed..00bafe3f1 100644 --- a/tests/nemo_text_processing/vi/test_cardinal.py +++ b/tests/nemo_text_processing/vi/test_cardinal.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytest tests/nemo_text_processing/vi/test_cardinal.py --cpu --cache-clear import pytest from parameterized import parameterized diff --git a/tests/nemo_text_processing/vi/test_date.py b/tests/nemo_text_processing/vi/test_date.py index 90885b6e4..b3da475db 100644 --- a/tests/nemo_text_processing/vi/test_date.py +++ b/tests/nemo_text_processing/vi/test_date.py @@ -15,28 +15,45 @@ import pytest from parameterized import parameterized -from ..utils import CACHE_DIR, parse_test_case_file +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -try: - from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - - PYNINI_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - PYNINI_AVAILABLE = False +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestDate: - inverse_normalizer = ( - InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None - ) + + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_date.txt')) - @pytest.mark.skipif( - not PYNINI_AVAILABLE, - reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", - ) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + normalizer_with_audio = ( + NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + if CACHE_DIR and RUN_AUDIO_BASED_TESTS + else None + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + + if self.normalizer_with_audio: + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, + n_tagged=30, + punct_post_process=False, + ) + assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/vi/test_decimal.py b/tests/nemo_text_processing/vi/test_decimal.py index a7b2103a8..73ed99f54 100644 --- a/tests/nemo_text_processing/vi/test_decimal.py +++ b/tests/nemo_text_processing/vi/test_decimal.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytest tests/nemo_text_processing/vi/test_decimal.py --cpu --cache-clear import pytest from parameterized import parameterized diff --git a/tests/nemo_text_processing/vi/test_fraction.py b/tests/nemo_text_processing/vi/test_fraction.py index 1751c7b8a..efa35fcce 100644 --- a/tests/nemo_text_processing/vi/test_fraction.py +++ b/tests/nemo_text_processing/vi/test_fraction.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytest tests/nemo_text_processing/vi/test_fraction.py --cpu --cache-clear import pytest from parameterized import parameterized diff --git a/tests/nemo_text_processing/vi/test_ordinal.py b/tests/nemo_text_processing/vi/test_ordinal.py index 3235e407a..9b15bd0c4 100644 --- a/tests/nemo_text_processing/vi/test_ordinal.py +++ b/tests/nemo_text_processing/vi/test_ordinal.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytest tests/nemo_text_processing/vi/test_ordinal.py --cpu --cache-clear import pytest from parameterized import parameterized diff --git a/tests/nemo_text_processing/vi/test_roman.py b/tests/nemo_text_processing/vi/test_roman.py new file mode 100644 index 000000000..a942eb140 --- /dev/null +++ b/tests/nemo_text_processing/vi/test_roman.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestRoman: + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + normalizer_with_audio = ( + NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + if CACHE_DIR and RUN_AUDIO_BASED_TESTS + else None + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_roman.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + + if self.normalizer_with_audio: + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, + n_tagged=30, + punct_post_process=False, + ) + assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh index 6a277c28c..7c8b184bf 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh @@ -28,10 +28,10 @@ testTNCardinal() { runtest $input } -# testTNDate() { -# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt -# runtest $input -# } +testTNDate() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt + runtest $input +} testTNDecimal() { input=$PROJECT_DIR/vi/data_text_normalization/test_cases_decimal.txt @@ -48,6 +48,11 @@ testTNFraction() { runtest $input } +testTNRoman() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_roman.txt + runtest $input +} + # testTNTime() { # input=$PROJECT_DIR/vi/data_text_normalization/test_cases_time.txt # runtest $input