NVIDIA · tbartley94 · Aug 12, 2025 · Aug 2, 2025 · Aug 2, 2025 · Aug 2, 2025
diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py
@@ -140,9 +140,9 @@ def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = N
             acc = acc + 1
         else:
             if input:
-                print(f"inpu: {json.dumps(input[i])}")
-            print(f"gold: {json.dumps(label_norm)}")
-            print(f"pred: {json.dumps(pred_norm)}")
+                print(f"input: {json.dumps(input[i], ensure_ascii=False)}")
+            print(f"gold: {json.dumps(label_norm, ensure_ascii=False)}")
+            print(f"pred: {json.dumps(pred_norm, ensure_ascii=False)}")
     return acc / nums
 
 

diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
@@ -176,7 +176,11 @@ def __init__(
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
         elif lang == 'vi':
             from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.vi.verbalizers.post_processing import PostProcessingFst
             from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst
+
+            if post_process:
+                self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache)
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
@@ -377,7 +381,7 @@ def normalize(
                 return text
         output = SPACE_DUP.sub(' ', output[1:])
 
-        if self.lang == "en" and hasattr(self, 'post_processor'):
+        if self.lang in ["en", "vi"] and hasattr(self, 'post_processor'):
             output = self.post_process(output)
 
         if punct_post_process:

diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py
@@ -35,7 +35,7 @@ def parse_args():
     parser.add_argument(
         "--lang",
         help="language",
-        choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'],
+        choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'vi'],
         default="en",
         type=str,
     )
@@ -104,8 +104,6 @@ def parse_args():
     print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values())))
     print(" - Total: " + str(sum(token_count_per_type.values())), '\n')
 
-    print(" - Total: " + str(sum(token_count_per_type.values())), '\n')
-
     for token_type in token_accuracy:
         if token_type not in known_types:
             raise ValueError("Unexpected token type: " + token_type)

diff --git a/nemo_text_processing/text_normalization/vi/data/measure/__init__.py b/nemo_text_processing/text_normalization/vi/data/measure/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv b/nemo_text_processing/text_normalization/vi/data/measure/base_units.tsv
@@ -0,0 +1,20 @@
+m	mét
+m2	mét vuông
+m3	mét khối
+m²	mét vuông
+m³	mét khối
+g	gam
+l	lít
+s	giây
+v	vôn
+w	oát
+hz	hẹc
+A	am pe
+b	bai
+B	byte
+pa	pascal
+ω	ohm
+Ω	ôm
+h	giờ
+min	phút
+hr	giờ
diff --git a/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv b/nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv
@@ -0,0 +1,25 @@
+°f	độ f
+°c	độ c
+°k	độ k
+ha	héc ta
+mi	mile
+ft	foot
+inch	inch
+yd	yard
+%	phần trăm
+hp	mã lực
+rad	radian
+kwh	ki lô oát giờ
+kbps	kilobit trên giây
+mbps	megabit trên giây
+ghz	gi ga hẹc
+mhz	mê ga hẹc
+tw	tê ra oát
+kcal	ki lô calo
+gb	gi ga bai
+mb	mê ga bai
+mV	mi li vôn
+MV	mê ga vôn
+tb	terabyte
+pb	petabyte
+g	gam
diff --git a/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/measure/prefixes.tsv
@@ -0,0 +1,17 @@
+k	ki lô
+M	mê ga
+G	gi ga
+T	tê ra
+P	pê ta
+E	ex xa
+h	hếc tô
+da	đề ca
+d	đề xi
+c	xăng ti
+m	mi li
+µ	mi crô
+μ	mi cờ rô
+n	na nô
+p	pi cô
+f	fem tô
+a	át tô
diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_bases.tsv
@@ -0,0 +1,8 @@
+g	gam
+m	mét
+m²	mét vuông
+m2	mét vuông
+m³	mét khối
+m3	mét khối
+l	lít
+B	bai
diff --git a/..._normalization/vi/data/money/per_unit.tsv → ...ion/vi/data/money/per_unit_non_metric.tsv b/..._normalization/vi/data/money/per_unit.tsv → ...ion/vi/data/money/per_unit_non_metric.tsv
@@ -1,5 +1,4 @@
 /giờ	trên giờ
-/g	trên giờ
 /h	trên giờ
 /ngày	trên ngày
 /d	trên ngày
@@ -13,33 +12,17 @@
 /lần	một lần
 /cái	một cái
 /chiếc	một chiếc
-/kg	một ki lô gam
-/g	một gam
-/cm	một xăng ti mét
-/m	một mét
-/km	một ki lô mét
-/cm²	một xăng ti mét vuông
-/m²	một mét vuông
-/m2	một mét vuông
-/m³	một mét khối
-/m3	một mét khối
-/l	một lít
-/ml	một mi li lít
 /người	một người
 /chỗ	một chỗ
 /bài	một bài
 /trang	một trang
 /từ	một từ
 /đồng	một đồng
-/KB	một kilobyte
-/GB	một gigabyte
-/MB	một megabyte
-/TB	một terabyte
-/tấn	một tấn
 /đêm	một đêm
 /buổi	một buổi
 /ca	một ca
 /dự án	một dự án
 /lớp	một lớp
 /khóa	một khóa
-/suất	một suất 
+/suất	một suất
+/tấn	một tấn
diff --git a/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv b/nemo_text_processing/text_normalization/vi/data/money/per_unit_prefixes.tsv
@@ -0,0 +1,6 @@
+k	ki lô
+M	mê ga
+G	gi ga
+c	xăng ti
+m	mi li
+T	tê ra
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
@@ -1,3 +1,3 @@
 1	một	mốt
 4	bốn	tư
-5	năm	lăm 
+5	năm	lăm
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv
@@ -1,5 +1,8 @@
 thousand	nghìn
 million	triệu
 billion	tỷ
+trillion	nghìn tỷ
+quadrillion	triệu tỷ
+quintillion	tỷ tỷ
 hundred	trăm
 linh	linh
diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py
@@ -62,6 +62,11 @@ def extract_field(field_name):
     return pynutil.delete(f"{field_name}:") + delete_space + pynutil.delete("\"") + quoted_text + pynutil.delete("\"")
 
 
+def extract_wrapper_content(wrapper_type: str, content_graph):
+    """Delete the literal '<wrapper_type> {' prefix and closing '}' around content_graph, e.g. 'decimal { ... }'."""
+    return pynutil.delete(f"{wrapper_type} {{") + delete_space + content_graph + delete_space + pynutil.delete("}")
+
+
 def convert_space(fst) -> "pynini.FstLike":
     """
     Converts space to nonbreaking space.