Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 nhất
4 tư
71 changes: 71 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/decimal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels


class DecimalFst(GraphFst):
"""
Finite state transducer for classifying Vietnamese decimal numbers, e.g.
-12,5 tỷ -> decimal { negative: "true" integer_part: "mười hai" fractional_part: "năm" quantity: "tỷ" }
818,303 -> decimal { integer_part: "tám trăm mười tám" fractional_part: "ba không ba" }
0,2 triệu -> decimal { integer_part: "không" fractional_part: "hai" quantity: "triệu" }

Args:
cardinal: CardinalFst instance for processing integer parts
deterministic: if True will provide a single transduction option,
for False multiple options (used for audio-based normalization)
"""

def __init__(self, cardinal: GraphFst, deterministic: bool = True):
super().__init__(name="decimal", kind="classify", deterministic=deterministic)

cardinal_graph = cardinal.graph_with_and
self.graph = cardinal.single_digits_graph.optimize()
if not deterministic:
self.graph = self.graph | cardinal_graph

single_digit_map = pynini.union(
*[pynini.cross(k, v) for k, v in load_labels(get_abs_path("data/numbers/digit.tsv"))],
*[pynini.cross(k, v) for k, v in load_labels(get_abs_path("data/numbers/zero.tsv"))]
)

quantity_units = pynini.union(*[v for _, v in load_labels(get_abs_path("data/numbers/magnitudes.tsv"))])

integer_part = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
fractional_part = (
pynutil.insert("fractional_part: \"")
+ (single_digit_map + pynini.closure(pynutil.insert(" ") + single_digit_map))
+ pynutil.insert("\"")
)

decimal_pattern = (
(integer_part + pynutil.insert(" ")).ques + pynutil.delete(",") + pynutil.insert(" ") + fractional_part
)

quantity_suffix = (
pynutil.delete(" ").ques + pynutil.insert(" quantity: \"") + quantity_units + pynutil.insert("\"")
)

decimal_with_quantity = decimal_pattern + quantity_suffix
cardinal_with_quantity = integer_part + quantity_suffix

negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques
final_graph = negative + pynini.union(decimal_pattern, decimal_with_quantity, cardinal_with_quantity)

self.fst = self.add_tokens(final_graph).optimize()
62 changes: 62 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/ordinal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels


class OrdinalFst(GraphFst):
"""
Finite state transducer for classifying Vietnamese ordinals, e.g.
thứ 1 -> ordinal { integer: "nhất" }
thứ 4 -> ordinal { integer: "tư" }
thứ 15 -> ordinal { integer: "mười lăm" }
Args:
cardinal: CardinalFst for number conversion
deterministic: if True will provide a single transduction option,
for False multiple options (used for audio-based normalization)
"""

def __init__(self, cardinal, deterministic: bool = True):
super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

prefix = "thứ "
number_pattern = pynini.closure(NEMO_DIGIT, 1)

ordinal_exceptions = {
row[0]: row[1] for row in load_labels(get_abs_path("data/ordinal/ordinal_exceptions.tsv"))
}

exception_patterns = []
for digit, word in ordinal_exceptions.items():
exception_patterns.append(pynini.cross(digit, word))

exception_graph = pynini.union(*exception_patterns) if exception_patterns else None

combined_graph = cardinal.graph
if exception_graph:
combined_graph = pynini.union(exception_graph, cardinal.graph)

self.graph = (
pynutil.delete(prefix)
+ pynutil.insert("integer: \"")
+ pynini.compose(number_pattern, combined_graph)
+ pynutil.insert("\"")
)

self.fst = self.add_tokens(self.graph).optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
generator_main,
)
from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
Expand Down Expand Up @@ -74,8 +76,20 @@ def __init__(
word_graph = WordFst(deterministic=deterministic).fst
logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")

start_time = time.time()
ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
ordinal_graph = ordinal.fst
logger.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes")

start_time = time.time()
decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
decimal_graph = decimal.fst
logger.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes")

classify = (
pynutil.add_weight(whitelist_graph, 0.8)
| pynutil.add_weight(ordinal_graph, 0.81)
| pynutil.add_weight(decimal_graph, 0.85)
| pynutil.add_weight(cardinal_graph, 0.9)
| pynutil.add_weight(word_graph, 100)
)
Expand Down
78 changes: 78 additions & 0 deletions nemo_text_processing/text_normalization/vi/verbalizers/decimal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class DecimalFst(GraphFst):
"""
Finite state transducer for verbalizing Vietnamese decimal numbers, e.g.
decimal { negative: "true" integer_part: "mười hai" fractional_part: "năm" quantity: "tỷ" } -> âm mười hai phẩy năm tỷ
decimal { integer_part: "tám trăm mười tám" fractional_part: "ba không ba" } -> tám trăm mười tám phẩy ba không ba
decimal { integer_part: "không" fractional_part: "hai" quantity: "triệu" } -> không phẩy hai triệu

Args:
cardinal: CardinalFst instance for handling integer verbalization
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""

def __init__(self, cardinal, deterministic: bool = True):
super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)

# Handle negative sign - Vietnamese uses "âm" for negative numbers
self.optional_sign = pynini.cross("negative: \"true\"", "âm ")
if not deterministic:
# Alternative ways to say negative in Vietnamese
self.optional_sign |= pynini.cross("negative: \"true\"", "trừ ")

self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)

self.integer = pynutil.delete("integer_part:") + cardinal.integer
self.optional_integer = pynini.closure(self.integer + delete_space + insert_space, 0, 1)

# Handle fractional part - Vietnamese uses "phẩy" (comma) instead of "point"
self.fractional_default = (
pynutil.delete("fractional_part:")
+ delete_space
+ pynutil.delete("\"")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynutil.delete("\"")
)

self.fractional = pynutil.insert("phẩy ") + self.fractional_default

self.quantity = (
delete_space
+ insert_space
+ pynutil.delete("quantity:")
+ delete_space
+ pynutil.delete("\"")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynutil.delete("\"")
)
self.optional_quantity = pynini.closure(self.quantity, 0, 1)

graph = self.optional_sign + (
self.integer
| (self.integer + self.quantity)
| (self.optional_integer + self.fractional + self.optional_quantity)
)

self.numbers = graph
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
48 changes: 48 additions & 0 deletions nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space


class OrdinalFst(GraphFst):
"""
Finite state transducer for verbalizing Vietnamese ordinals, e.g.
ordinal { integer: "nhất" } -> thứ nhất
ordinal { integer: "tư" } -> thứ tư
ordinal { integer: "mười lăm" } -> thứ mười lăm
ordinal { integer: "một trăm" } -> thứ một trăm

Args:
deterministic: if True will provide a single transduction option,
for False multiple options (used for audio-based normalization)
"""

def __init__(self, deterministic: bool = True):
super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

quoted_content = pynini.closure(NEMO_NOT_QUOTE)

integer = (
pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"")
)

ordinal_pattern = pynutil.insert("thứ ") + integer

self.ordinal_graph = ordinal_pattern

delete_tokens = self.delete_tokens(self.ordinal_graph)
self.fst = delete_tokens.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst
from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst


Expand All @@ -32,7 +34,13 @@ def __init__(self, deterministic: bool = True):
word = WordFst(deterministic=deterministic)
word_graph = word.fst

ordinal = OrdinalFst(deterministic=deterministic)
ordinal_graph = ordinal.fst

decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
decimal_graph = decimal.fst

# Combine all verbalizers
graph = cardinal_graph | whitelist_graph | word_graph
graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph

self.fst = graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
0,2 triệu~không phẩy hai triệu
18 vạn~mười tám vạn
818,303~tám trăm mười tám phẩy ba không ba
-99,95 tỷ~âm chín mươi chín phẩy chín năm tỷ
60,240~sáu mươi phẩy hai bốn không
-0,007~âm không phẩy không không bảy
123,000~một trăm hai mươi ba phẩy không không không
1,5 triệu~một phẩy năm triệu
3,14 tỷ~ba phẩy một bốn tỷ
10,01 vạn~mười phẩy không một vạn
-12,5~âm mười hai phẩy năm
0,0001~không phẩy không không không một
999,999~chín trăm chín mươi chín phẩy chín chín chín
1,01~một phẩy không một
-1,01~âm một phẩy không một
15,6~mười lăm phẩy sáu
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
thứ 1~thứ nhất
hôm nay là thứ hai~hôm nay là thứ hai
thứ 3 là ngày giữa tuần~thứ ba là ngày giữa tuần
thứ 4 nên làm gì~thứ tư nên làm gì
thứ 7~thứ bảy
con giáp thứ 13~con giáp thứ mười ba
thứ 1~thứ nhất
thứ 4~thứ tư
thứ 2~thứ hai
thứ 3~thứ ba
thứ 5~thứ năm
thứ 6~thứ sáu
thứ 7~thứ bảy
thứ 8~thứ tám
thứ 9~thứ chín
thứ 10~thứ mười
thứ 11~thứ mười một
thứ 12~thứ mười hai
thứ 15~thứ mười lăm
thứ 21~thứ hai mươi mốt
thứ 24~thứ hai mươi tư
thứ 34~thứ ba mươi tư
thứ 100~thứ một trăm
thứ 101~thứ một trăm linh một
thứ 104~thứ một trăm linh bốn
thứ 234~thứ hai trăm ba mươi tư
thứ 1000~thứ một nghìn
thứ 1234~thứ một nghìn hai trăm ba mươi tư
hôm nay thứ 2~hôm nay thứ hai
đứng thứ 15~đứng thứ mười lăm
Loading