Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ pipeline {
}
}

stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') {
stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') {
when {
anyOf {
branch 'main'
Expand All @@ -200,6 +200,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}'
}
}
// Runs the text-normalization entry point for Vietnamese (--lang=vi) on the input "100",
// with CUDA_VISIBLE_DEVICES set to an empty string and grammars cached under ${VI_TN_CACHE}.
stage('L0: VI TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: HU TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}'
Expand Down
3 changes: 3 additions & 0 deletions nemo_text_processing/text_normalization/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ def __init__(
elif lang == 'ja':
from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
elif lang == 'vi':
from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst
else:
raise NotImplementedError(f"Language {lang} has not been supported yet.")

Expand Down
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/vi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/vi/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
1 một
2 hai
3 ba
4 bốn
5 năm
6 sáu
7 bảy
8 tám
9 chín
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1 một mốt
4 bốn tư
5 năm lăm
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
thousand nghìn
million triệu
billion tỷ
hundred trăm
linh linh
10 changes: 10 additions & 0 deletions nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
10 mười
11 mười một
12 mười hai
13 mười ba
14 mười bốn
15 mười lăm
16 mười sáu
17 mười bảy
18 mười tám
19 mười chín
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
2 hai mươi
3 ba mươi
4 bốn mươi
5 năm mươi
6 sáu mươi
7 bảy mươi
8 tám mươi
9 chín mươi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0 không
Empty file.
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
181 changes: 181 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/cardinal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.vi.utils import get_abs_path


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying Vietnamese cardinals written with digits.

    The tagger accepts an optional leading "-" followed by 1 to 12 digits and emits
    ``integer: "<words>"`` (preceded by ``negative: "true" `` when the minus sign is present),
    passed through ``add_tokens``.

    Word forms come from the TSV files under ``data/numbers``:
        zero.tsv, digit.tsv, teen.tsv, ties.tsv  - loaded directly as string-file graphs
        magnitudes.tsv                           - two-column key -> word table
        digit_special.tsv                        - three-column digit -> standard / alternate form table

    Args:
        deterministic: forwarded to GraphFst; it is not otherwise consulted while building this graph.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        # Basic lookup graphs, one per resource file.
        self.zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        self.digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        self.teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        self.ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))

        # Magnitude words: only rows with exactly two tab-separated fields are kept.
        magnitude_words = {}
        with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as magnitude_file:
            for row in magnitude_file:
                fields = row.strip().split('\t')
                if len(fields) == 2:
                    magnitude_words[fields[0]] = fields[1]
        self.magnitudes = magnitude_words

        # Special digits: rows need at least three fields (digit, standard form, alternate form).
        special = {}
        with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as special_file:
            for row in special_file:
                fields = row.strip().split('\t')
                if len(fields) >= 3:
                    special[fields[0]] = {'std': fields[1], 'alt': fields[2]}

        special_keys = ["1", "4", "5"]

        # Alternate readings of 1/4/5, used as the unit after a tens word.
        alternate_arcs = []
        for key, forms in special.items():
            if key in special_keys:
                alternate_arcs.append(pynini.cross(key, forms["alt"]))
        self.special_digits = pynini.union(*alternate_arcs)

        # Units allowed after "linh": the standard readings of 1/4/5 plus every plain digit.
        standard_arcs = [pynini.cross(key, special[key]["std"]) for key in special_keys]
        self.linh_digits = pynini.union(*standard_arcs, self.digit)

        self.single_digit = self.digit

        # 10-19 from the teen table; round tens drop the trailing "0"; otherwise tens + unit,
        # where units 1/4/5 take their alternate reading and the rest use the plain digit table.
        regular_unit = pynini.union("2", "3", "6", "7", "8", "9") @ self.digit
        self.two_digit = pynini.union(
            self.teen,
            self.ties + pynutil.delete("0"),
            self.ties + insert_space + pynini.union(self.special_digits, regular_unit),
        )

        # Three-digit groups: "X00", "X0Y" (read with "linh") and "XYZ".
        hundred_head = self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"])
        linh_word = pynutil.insert(self.magnitudes["linh"])
        self.hundreds_pattern = pynini.union(
            hundred_head + pynutil.delete("00"),
            hundred_head + pynutil.delete("0") + insert_space + linh_word + insert_space + self.linh_digits,
            hundred_head + insert_space + self.two_digit,
        )

        self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern

        # Each larger magnitude may reuse the previous one for its remainder.
        self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3)
        self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand)
        self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million)

        self.graph = pynini.union(
            self.billion, self.million, self.thousand, self.hundreds, self.two_digit, self.single_digit, self.zero
        ).optimize()

        self.single_digits_graph = self.single_digit | self.zero
        self.graph_with_and = self.graph

        optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
        tagged_integer = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(optional_minus + tagged_integer).optimize()

    def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None):
        """
        Build the graph for numbers whose highest group is the magnitude `name`.

        For every total width in [min_digits, max_digits] the leading (width - zero_count) digits are
        read as a 1-, 2- or 3-digit number followed by the magnitude word; the remaining `zero_count`
        digits are then matched by one of:
            * all zeros (deleted),
            * `prev_pattern`, when supplied,
            * leading zeros (deleted) followed by a 1-, 2- or 3-digit remainder,
            * a few hand-written shapes for 7-digit "million" and 10-digit "billion" inputs.

        NOTE(review): remainders with interior zero groups outside those listed shapes
        (for example "1023456") appear to have no matching path here - confirm against tests.

        Args:
            name: key into self.magnitudes ("thousand", "million", "billion").
            min_digits: smallest total number of digits handled.
            max_digits: largest total number of digits handled.
            zero_count: number of digits that follow the leading group.
            prev_pattern: optional graph for the next lower magnitude.

        Returns:
            Union of the per-width graphs, each restricted to inputs of exactly that many digits.
        """
        magnitude_word = self.magnitudes[name]

        per_width = []
        for width in range(min_digits, max_digits + 1):
            lead_width = width - zero_count
            if lead_width == 1:
                head = self.single_digit
            elif lead_width == 2:
                head = self.two_digit
            else:
                head = self.hundreds_pattern

            prefix = head + insert_space + pynutil.insert(magnitude_word)

            # Round number: nothing but zeros after the leading group.
            alternatives = [prefix + pynutil.delete("0" * zero_count)]

            if prev_pattern:
                alternatives.append(prefix + insert_space + prev_pattern)

            # Remainder of at most three significant digits, preceded by `skipped` zeros.
            for skipped in range(zero_count):
                remainder_width = zero_count - skipped
                if remainder_width > 3:
                    continue
                after_zeros = prefix + pynutil.delete("0" * skipped) + insert_space
                if remainder_width == 1:
                    alternatives.append(
                        after_zeros + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits
                    )
                elif remainder_width == 2:
                    alternatives.append(after_zeros + self.two_digit)
                else:
                    alternatives.append(after_zeros + self.hundreds_pattern)

            if name == "million" and width == 7:
                # Shapes "X 00A 00B" and "X 0AB 00C": thousands group followed by "linh" + unit.
                thousand_then_linh = (
                    insert_space
                    + pynutil.insert(self.magnitudes["thousand"])
                    + pynutil.delete("00")
                    + insert_space
                    + pynutil.insert(self.magnitudes["linh"])
                    + insert_space
                    + self.linh_digits
                )
                alternatives.append(
                    prefix + pynutil.delete("00") + insert_space + self.single_digit + thousand_then_linh
                )
                alternatives.append(prefix + pynutil.delete("0") + insert_space + self.two_digit + thousand_then_linh)
            elif name == "billion" and width == 10:
                # Shape "X 00A 00B CDE": single-digit millions and thousands, then a full hundreds group.
                million_part = (
                    pynutil.delete("00")
                    + insert_space
                    + self.single_digit
                    + insert_space
                    + pynutil.insert(self.magnitudes["million"])
                )
                thousand_part = (
                    pynutil.delete("00")
                    + insert_space
                    + self.single_digit
                    + insert_space
                    + pynutil.insert(self.magnitudes["thousand"])
                )
                alternatives.append(prefix + million_part + thousand_part + insert_space + self.hundreds_pattern)

            exact_width = pynini.closure(NEMO_DIGIT, width, width)
            per_width.append(exact_width @ pynini.union(*alternatives))

        return pynini.union(*per_width)
38 changes: 38 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/punctuation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import GraphFst


class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation for Vietnamese.

    Accepts a single character taken from a fixed set of ASCII and typographic punctuation marks
    and emits ``punctuation { name: "<mark>" }``.

    Args:
        deterministic: forwarded to GraphFst; the graph built here does not depend on it.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

        # Each character of this literal is one accepted mark. The characters are handed to
        # pynini.union() one by one; the literal contains no square brackets, double quote
        # or backslash, and nothing is escaped.
        s = "!#$%&'()*+,-./:;<=>?@^_`{|}~–—――…»«„“›‹‚‘’⟨⟩"
        any_mark = pynini.union(*s)

        # name: "<mark>" wrapped in the punctuation { ... } token.
        name_field = pynutil.insert('name: "') + any_mark + pynutil.insert('"')
        token = pynutil.insert("punctuation { ") + name_field + pynutil.insert(" }")

        self.fst = token.optimize()
Loading