From 66ce1d3d36349015244f9329e088e80524b9786d Mon Sep 17 00:00:00 2001
From: folivoramanh
Date: Thu, 12 Jun 2025 12:28:49 +0700
Subject: [PATCH 1/4] Add Vietnamese text normalization for cardinal semiotic
 class

Signed-off-by: folivoramanh
---
 .../text_normalization/normalize.py           |   3 +
 .../text_normalization/vi/__init__.py         |  13 +
 .../vi/data/numbers/__init__.py               |   0
 .../vi/data/numbers/digit.tsv                 |   9 +
 .../vi/data/numbers/digit_special.tsv         |   3 +
 .../vi/data/numbers/teen.tsv                  |  10 +
 .../vi/data/numbers/ties.tsv                  |   8 +
 .../vi/data/numbers/units.tsv                 |   5 +
 .../vi/data/numbers/zero.tsv                  |   1 +
 .../text_normalization/vi/data/whitelist.tsv  |   0
 .../text_normalization/vi/taggers/__init__.py |  13 +
 .../text_normalization/vi/taggers/cardinal.py | 282 ++++++++++++++++++
 .../vi/taggers/punctuation.py                 |  57 ++++
 .../vi/taggers/tokenize_and_classify.py       |  92 ++++++
 .../vi/taggers/whitelist.py                   |  70 +++++
 .../text_normalization/vi/taggers/word.py     |  34 +++
 .../text_normalization/vi/utils.py            |  42 +++
 .../vi/verbalizers/__init__.py                |  13 +
 .../vi/verbalizers/cardinal.py                |  55 ++++
 .../vi/verbalizers/verbalize.py               |  38 +++
 .../vi/verbalizers/verbalize_final.py         |  72 +++++
 .../vi/verbalizers/whitelist.py               |  42 +++
 .../text_normalization/vi/verbalizers/word.py |  37 +++
 .../test_cases_cardinal.txt                   | 107 +++++++
 .../nemo_text_processing/vi/test_cardinal.py  |  45 ++-
 ..._sparrowhawk_inverse_text_normalization.sh |   2 +-
 .../vi/test_sparrowhawk_normalization.sh      |  77 +++++
 .../pynini_export.py                          |   6 +-
 28 files changed, 1119 insertions(+), 17 deletions(-)
 create mode 100644 nemo_text_processing/text_normalization/vi/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/data/whitelist.tsv
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/punctuation.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/whitelist.py
 create mode 100644 nemo_text_processing/text_normalization/vi/taggers/word.py
 create mode 100644 nemo_text_processing/text_normalization/vi/utils.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
 create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/word.py
 create mode 100644 tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
 create mode 100644 tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh

diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 82f8f43d2..329b28338 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -174,6 +174,9 @@ def __init__(
         elif lang == 'ja':
             from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
+        elif lang == 'vi':
+            from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.vi.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
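The `normalize.py` hunk above wires Vietnamese into the `Normalizer` language dispatch. A minimal smoke test of the new branch, assuming this patch is applied and pynini is installed; the call signatures mirror those used in the updated `tests/nemo_text_processing/vi/test_cardinal.py`, and the expected output comes from `test_cases_cardinal.txt`:

```python
# Sketch only: exercise the new `lang='vi'` branch end to end.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case='cased', lang='vi')
# Expected per test_cases_cardinal.txt: "một trăm linh một"
print(normalizer.normalize("101", verbose=False))
```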
diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
new file mode 100644
index 000000000..573c20bd4
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit.tsv
@@ -0,0 +1,9 @@
+1	một
+2	hai
+3	ba
+4	bốn
+5	năm
+6	sáu
+7	bảy
+8	tám
+9	chín
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
new file mode 100644
index 000000000..919baaf6e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/digit_special.tsv
@@ -0,0 +1,3 @@
+1	một	mốt
+4	bốn	tư
+5	năm	lăm
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
new file mode 100644
index 000000000..8d99f8a69
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/teen.tsv
@@ -0,0 +1,10 @@
+10	mười
+11	mười một
+12	mười hai
+13	mười ba
+14	mười bốn
+15	mười lăm
+16	mười sáu
+17	mười bảy
+18	mười tám
+19	mười chín
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
new file mode 100644
index 000000000..da88b8ab8
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/ties.tsv
@@ -0,0 +1,8 @@
+2	hai mươi
+3	ba mươi
+4	bốn mươi
+5	năm mươi
+6	sáu mươi
+7	bảy mươi
+8	tám mươi
+9	chín mươi
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
new file mode 100644
index 000000000..c8a08083c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv
@@ -0,0 +1,5 @@
+thousand	nghìn
+million	triệu
+billion	tỷ
+hundred	trăm
+linh	linh
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
new file mode 100644
index 000000000..df062e38c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/numbers/zero.tsv
@@ -0,0 +1 @@
+0	không
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/data/whitelist.tsv b/nemo_text_processing/text_normalization/vi/data/whitelist.tsv
new file mode 100644
index 000000000..e69de29bb
diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
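The number data above uses two layouts: two-column files mapping a written form to its spoken form, and the three-column `digit_special.tsv` mapping a digit to its standard form (used after `linh`) and its contextual form (used after `mươi`). A short orientation sketch, not part of the patch, showing how both layouts read:

```python
# Orientation only: the two TSV layouts used by the Vietnamese number data.
import csv

def read_tsv(path):
    with open(path, encoding="utf-8") as f:
        return list(csv.reader(f, delimiter="\t"))

# digit.tsv rows:         ["1", "một"], ..., ["9", "chín"]
# digit_special.tsv rows: ["1", "một", "mốt"], ["4", "bốn", "tư"], ["5", "năm", "lăm"]
# e.g. 51 -> "năm mươi mốt" (contextual form), 101 -> "một trăm linh một" (standard form)
```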
diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py
new file mode 100644
index 000000000..fdd3cae82
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space
+from nemo_text_processing.text_normalization.vi.utils import get_abs_path
+
+
+def load_data_map(filename):
+    """Load TSV data as pynini string map."""
+    mappings = []
+    with open(get_abs_path(f"data/numbers/{filename}"), 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                parts = line.split('\t')
+                if len(parts) >= 2:
+                    mappings.append((parts[0], parts[1]))
+    return pynini.string_map(mappings)
+
+
+class CardinalFst(GraphFst):
+    """
+    Simplified Vietnamese cardinal FST using recursive pattern building.
+    Reduced from 700+ lines to ~200 lines while maintaining full functionality.
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+
+        # Load all basic data maps
+        zero = load_data_map("zero.tsv")
+        digit = load_data_map("digit.tsv")
+        teen = load_data_map("teen.tsv")
+        ties = load_data_map("ties.tsv")
+
+        # Load units as dict for easy access
+        units = {}
+        with open(get_abs_path("data/numbers/units.tsv"), 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split('\t')
+                if len(parts) == 2:
+                    units[parts[0]] = parts[1]
+
+        # Load special digits (contextual variants)
+        special = {}
+        with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split('\t')
+                if len(parts) >= 3:
+                    special[parts[0]] = {'std': parts[1], 'alt': parts[2]}
+
+        # Build core patterns
+        single_digit = digit
+
+        # Special digits for specific contexts (X1, X4, X5 → mốt, tư, lăm)
+        special_1 = pynini.cross("1", special["1"]["alt"])  # mốt
+        special_4 = pynini.cross("4", special["4"]["alt"])  # tư
+        special_5 = pynini.cross("5", special["5"]["alt"])  # lăm
+
+        # Linh digits (for 0X patterns) - use standard forms
+        linh_digits = pynini.union(
+            pynini.cross("1", special["1"]["std"]),  # một
+            pynini.cross("4", special["4"]["std"]),  # bốn
+            pynini.cross("5", special["5"]["std"]),  # năm
+            digit
+        )
+
+        # Two digit patterns
+        two_digit = pynini.union(
+            teen,  # 10-19
+            ties + pynutil.delete("0"),  # 20, 30, etc.
+            ties + insert_space + pynini.union(
+                special_1, special_4, special_5,  # X1, X4, X5 cases
+                pynini.union("2", "3", "6", "7", "8", "9") @ digit  # other digits
+            )
+        )
+
+        # Build hundreds (3 digits: 100-999)
+        hundreds_base = pynini.union(
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("00"),
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("0")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            single_digit + insert_space + pynutil.insert(units["hundred"]) + insert_space + two_digit
+        )
+        hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ hundreds_base
+
+        # Build thousands (4-6 digits) with explicit digit constraints
+        # 4-digit thousands (1000-9999)
+        thousands_4d = pynini.union(
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        # 5-digit thousands (10000-99999)
+        thousands_5d = pynini.union(
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            two_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        # 6-digit thousands (100000-999999)
+        thousands_6d = pynini.union(
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"),
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+            + insert_space + two_digit,
+            hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base
+        )
+
+        thousands = pynini.union(
+            pynini.closure(NEMO_DIGIT, 6, 6) @ thousands_6d,
+            pynini.closure(NEMO_DIGIT, 5, 5) @ thousands_5d,
+            pynini.closure(NEMO_DIGIT, 4, 4) @ thousands_4d
+        )
+
+        # Build millions (7-9 digits) with explicit patterns to fix precedence
+        # 7-digit millions (1000000-9999999)
+        millions_7d = pynini.union(
+            # Exact millions: 1000000, 2000000, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            # Millions with linh: 1000001, 1000002, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Millions with tens: 1000010, 1000020, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
+            # Millions with hundreds: 1000100, 1000200, etc.
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000")
+            + insert_space + hundreds_base,
+            # Millions with thousands: 5500000 -> năm triệu năm trăm nghìn
+            single_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d,
+            # Complex millions: X001YYY -> X triệu một nghìn YYY (critical fix for 1001001)
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00")
+            + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Complex millions: X0YZWWW -> X triệu YZ nghìn WWW (critical fix for 1050003)
+            single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0")
+            + insert_space + two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            # Full millions: X123YZW -> X triệu YZW nghìn/trăm/etc (1050003)
+            single_digit + insert_space + pynutil.insert(units["million"]) + insert_space
+            + pynini.closure(NEMO_DIGIT, 3, 3) @ (
+                pynini.union(
+                    # YZW000 patterns - invalid for 6 digits, skip
+                    # YZ0ABC patterns
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+                    + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+                    # YZ0ABC patterns with tens
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0")
+                    + insert_space + two_digit,
+                    # YYZABC patterns with hundreds
+                    hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base,
+                    # 0YYZABC patterns (hundreds only)
+                    pynutil.delete("0") + hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base,
+                    # 00YABC patterns (tens only)
+                    pynutil.delete("00") + hundreds_base,
+                    # Y00ABC patterns (single thousand)
+                    single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00")
+                    + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+                    # YZ00AB patterns (tens of thousands)
+                    two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000")
+                )
+            )
+        )
+
+        # 8-digit millions (10000000-99999999)
+        millions_8d = pynini.union(
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
+            two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000")
+            + insert_space + hundreds_base,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d,
+            two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d
+        )
+
+        # 9-digit millions (100000000-999999999)
+        millions_9d = pynini.union(
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"),
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000")
+            + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits,
+            hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000")
+            + insert_space + two_digit,
pynutil.delete("0000") + + insert_space + two_digit, + hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") + + insert_space + hundreds_base, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, + hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d + ) + + millions = pynini.union( + pynini.closure(NEMO_DIGIT, 9, 9) @ millions_9d, + pynini.closure(NEMO_DIGIT, 8, 8) @ millions_8d, + pynini.closure(NEMO_DIGIT, 7, 7) @ millions_7d + ) + + # Build billions (10-12 digits) with explicit patterns + # 10-digit billions (1000000000-9999999999) + billions_10d = pynini.union( + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00000000") + + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("0000000") + + insert_space + two_digit, + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000") + + insert_space + hundreds_base, + # Complex billions: 1001001101 -> một tỷ một triệu một nghìn một trăm linh một + single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00") + + insert_space + single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") + + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, + # Full billions with millions + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + # 11-digit billions (10000000000-99999999999) + billions_11d = pynini.union( + two_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + # 12-digit billions (100000000000-999999999999) + billions_12d = pynini.union( + hundreds_base + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, + hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d + ) + + billions = pynini.union( + pynini.closure(NEMO_DIGIT, 12, 12) @ billions_12d, + pynini.closure(NEMO_DIGIT, 11, 11) @ billions_11d, + pynini.closure(NEMO_DIGIT, 10, 10) @ billions_10d + ) + + # Combine all patterns with proper precedence (longest first) + self.graph = pynini.union( + billions, # 10-12 digits + millions, # 7-9 digits + thousands, # 4-6 digits + hundreds, # 3 digits + two_digit, # 2 digits + single_digit, # 1 digit + zero # 0 + ).optimize() + + # For decimal usage + self.single_digits_graph = single_digit | zero + self.graph_with_and = self.graph + + # Build final FST with negative 
diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py
new file mode 100644
index 000000000..f0d20918e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst
+
+
+class PunctuationFst(GraphFst):
+    """
+    Finite state transducer for classifying punctuation for Vietnamese
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
+
+        # Common punctuation marks
+        # Use escape() for brackets since they are special regex chars
+        punct = (
+            pynini.accep(".")
+            | pynini.accep(",")
+            | pynini.accep("?")
+            | pynini.accep("!")
+            | pynini.accep(":")
+            | pynini.accep(";")
+            | pynini.accep("-")
+            | pynini.accep("–")
+            | pynini.accep("—")
+            | pynini.accep("(")
+            | pynini.accep(")")
+            | pynini.accep(pynini.escape("["))
+            | pynini.accep(pynini.escape("]"))
+            | pynini.accep(pynini.escape("{"))
+            | pynini.accep(pynini.escape("}"))
+            | pynini.accep('"')
+            | pynini.accep("'")
+            | pynini.accep("...")
+            | pynini.accep("…")
+        )
+
+        # Create the punctuation transduction
+        graph = pynutil.insert('name: "') + punct + pynutil.insert('"')
+
+        final_graph = pynutil.insert("punctuation { ") + graph + pynutil.insert(" }")
+        self.fst = final_graph.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..52038b14d
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
+from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    def __init__(
+        self,
+        input_case: str,
+        deterministic: bool = True,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logger.info("Creating Vietnamese ClassifyFst grammars.")
+
+            start_time = time.time()
+            cardinal = CardinalFst(deterministic=deterministic)
+            cardinal_graph = cardinal.fst
+            logger.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes")
+
+            start_time = time.time()
+            punctuation = PunctuationFst(deterministic=deterministic)
+            punct_graph = punctuation.fst
+            logger.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")
+
+            start_time = time.time()
+            # Pass the user-provided whitelist file through (and avoid shadowing
+            # the `whitelist` argument with the FST instance)
+            whitelist_fst = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
+            whitelist_graph = whitelist_fst.fst
+            logger.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes")
+
+            start_time = time.time()
+            word_graph = WordFst(deterministic=deterministic).fst
+            logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")
+
+            classify = (
+                pynutil.add_weight(whitelist_graph, 0.8)
+                | pynutil.add_weight(cardinal_graph, 0.9)
+                | pynutil.add_weight(word_graph, 100)
+            )
+            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            token_plus_punct = (
+                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
+            )
+
+            graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct)
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
\ No newline at end of file
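For orientation, the intermediate string this classifier is expected to produce between tagging and verbalization, inferred from the grammars in this patch (the verbalizer docstrings further down show the matching input):

```python
# Hedged illustration of the two-stage pipeline. CardinalFst wraps its fields
# in `cardinal { ... }` via add_tokens(), and ClassifyFst wraps every token in
# `tokens { ... }`; e.g. the expected tagger output for "-25":
tagged = 'tokens { cardinal { negative: "true" integer: "hai mươi lăm" } }'
# VerbalizeFinalFst then strips the structure, yielding: "âm hai mươi lăm"
```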
diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py
new file mode 100644
index 000000000..5ffd7732e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
+from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for classifying whitelist for Vietnamese, e.g.
+        "h" -> tokens { name: "giờ" }
+        "p" -> tokens { name: "phút" }
+        "s" -> tokens { name: "giây" }
+    This class has the highest priority among all classifier grammars.
+    Whitelisted tokens are defined in and loaded from "data/whitelist.tsv".
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+        input_file: path to a file with whitelist replacements
+    """
+
+    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
+        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
+
+        def _get_whitelist_graph(input_case, file):
+            whitelist = load_labels(file)
+            if input_case == "lower_cased":
+                whitelist = [[x[0].lower()] + x[1:] for x in whitelist]
+            graph = pynini.string_map(whitelist)
+            return graph
+
+        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv"))
+        if not deterministic and input_case != "lower_cased":
+            graph |= pynutil.add_weight(
+                _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv")), weight=0.0001
+            )
+
+        if input_file:
+            whitelist_provided = _get_whitelist_graph(input_case, input_file)
+            if not deterministic:
+                graph |= whitelist_provided
+            else:
+                graph = whitelist_provided
+
+        # Add time units for better time handling (skipped while the optional
+        # data/time/time_units.tsv has not been added yet)
+        if not deterministic:
+            time_units_path = get_abs_path("data/time/time_units.tsv")
+            if os.path.exists(time_units_path):
+                graph |= _get_whitelist_graph(input_case, file=time_units_path)
+
+        self.graph = graph
+        self.final_graph = convert_space(self.graph).optimize()
+        self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
+
+        # Add tokens wrapper
+        self.fst = self.add_tokens(self.fst)
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py
new file mode 100644
index 000000000..c3e6d408e
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/taggers/word.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for classifying Vietnamese words.
+        e.g. ngày -> name: "ngày"
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="word", kind="classify", deterministic=deterministic)
+        word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
+        self.fst = word.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/utils.py b/nemo_text_processing/text_normalization/vi/utils.py
new file mode 100644
index 000000000..332330921
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/utils.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
+
+
+def load_labels(abs_path):
+    """
+    Loads a label TSV file as a list of mappings
+
+    Args:
+        abs_path: absolute path to the file
+
+    Returns a list of mappings
+    """
+    with open(abs_path, encoding="utf-8") as label_tsv:
+        labels = list(csv.reader(label_tsv, delimiter="\t"))
+    return labels
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
new file mode 100644
index 000000000..602b8a347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
new file mode 100644
index 000000000..5ca695673
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing Vietnamese cardinal numbers, e.g.
+        cardinal { negative: "true" integer: "hai mươi ba" } -> âm hai mươi ba
+        cardinal { integer: "một trăm" } -> một trăm
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
+
+        # Handle negative sign - Vietnamese uses "âm" for negative numbers
+        self.optional_sign = pynini.cross("negative: \"true\"", "âm ")
+        if not deterministic:
+            # Alternative way to say negative in Vietnamese
+            self.optional_sign |= pynini.cross("negative: \"true\"", "trừ ")
+
+        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
+
+        # Handle the integer part
+        integer = pynini.closure(NEMO_NOT_QUOTE)
+
+        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
+        integer = pynutil.delete("integer:") + self.integer
+
+        # Combine negative sign with integer
+        self.numbers = self.optional_sign + integer
+
+        # Delete the token structure and create final FST
+        delete_tokens = self.delete_tokens(self.numbers)
+        self.fst = delete_tokens.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
new file mode 100644
index 000000000..772b2b5f5
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
+from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst
+
+
+class VerbalizeFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+
+        # Initialize verbalizers
+        cardinal = CardinalFst(deterministic=deterministic)
+        cardinal_graph = cardinal.fst
+
+        whitelist = WhiteListFst(deterministic=deterministic)
+        whitelist_graph = whitelist.fst
+
+        word = WordFst(deterministic=deterministic)
+        word_graph = word.fst
+
+        # Combine all verbalizers
+        graph = cardinal_graph | whitelist_graph | word_graph
+
+        self.fst = graph
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..a049a5796
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
+from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.utils.logging import logger
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire Vietnamese sentence, e.g.
+        tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" }
+        -> xin một trăm chào
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple options are generated (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
+        else:
+            verbalize = VerbalizeFst(deterministic=deterministic).fst
+            word = WordFst(deterministic=deterministic).fst
+
+            types = verbalize | word
+            graph = (
+                pynutil.delete("tokens")
+                + delete_space
+                + pynutil.delete("{")
+                + delete_space
+                + types
+                + delete_space
+                + pynutil.delete("}")
+            )
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph.optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
new file mode 100644
index 000000000..0b77ee498
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for verbalizing whitelist for Vietnamese
+        e.g. tokens { name: "giờ" } -> giờ
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)
+        graph = (
+            pynutil.delete("name:")
+            + delete_space
+            + pynutil.delete("\"")
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+        )
+        graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
new file mode 100644
index 000000000..3ad9a1a82
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for verbalizing Vietnamese words.
+        e.g. tokens { name: "ngày" } -> ngày
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="word", kind="verbalize", deterministic=deterministic)
+        chars = pynini.closure(NEMO_CHAR - " ", 1)
+        char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")
+        graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
+
+        self.fst = graph.optimize()
\ No newline at end of file
diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..aad7ae8c1
--- /dev/null
+++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,107 @@
+1~một
+2~hai
+3~ba
+4~bốn
+5~năm
+6~sáu
+7~bảy
+8~tám
+9~chín
+10~mười
+11~mười một
+12~mười hai
+15~mười lăm
+18~mười tám
+19~mười chín
+20~hai mươi
+21~hai mươi mốt
+25~hai mươi lăm
+30~ba mươi
+34~ba mươi tư
+44~bốn mươi tư
+55~năm mươi lăm
+67~sáu mươi bảy
+70~bảy mươi
+80~tám mươi
+95~chín mươi lăm
+100~một trăm
+101~một trăm linh một
+102~một trăm linh hai
+104~một trăm linh bốn
+105~một trăm linh năm
+110~một trăm mười
+111~một trăm mười một
+120~một trăm hai mươi
+123~một trăm hai mươi ba
+200~hai trăm
+201~hai trăm linh một
+500~năm trăm
+999~chín trăm chín mươi chín
+1000~một nghìn
+1001~một nghìn linh một
+1020~một nghìn hai mươi
+1095~một nghìn chín mươi lăm
+1100~một nghìn một trăm
+2000~hai nghìn
+10000~mười nghìn
+100000~một trăm nghìn
+1000000~một triệu
+2000000~hai triệu
+1000000000~một tỷ
+-1~âm một
+-25~âm hai mươi lăm
+-100~âm một trăm
+-1000~âm một nghìn
+0~không
+1000~một nghìn
+1001~một nghìn linh một
+101~một trăm linh một
+104~một trăm linh bốn
+105~một trăm linh năm
+24~hai mươi tư
+35~ba mươi lăm
+41~bốn mươi mốt
+55~năm mươi lăm
+91~chín mươi mốt
+14~mười bốn
+16~mười sáu
+17~mười bảy
+37~ba mươi bảy
+47~bốn mươi bảy
+57~năm mươi bảy
+63~sáu mươi ba
+79~bảy mươi chín
+84~tám mươi tư
+98~chín mươi tám
+-123~âm một trăm hai mươi ba
+-1001~âm một nghìn linh một
+-104~âm một trăm linh bốn
+1000001~một triệu linh một
+1001001~một triệu một nghìn linh một
+1050003~một triệu năm mươi nghìn linh ba
+1000000001~một tỷ linh một
+1001001101~một tỷ một triệu một nghìn một trăm linh một
+300~ba trăm
+400~bốn trăm
+500~năm trăm
+6000~sáu nghìn
+7000~bảy nghìn
+15000~mười lăm nghìn
+300000~ba trăm nghìn
+450000~bốn trăm năm mươi nghìn
+5000000~năm triệu
+700000000~bảy trăm triệu
+31~ba mươi mốt
+41~bốn mươi mốt
+51~năm mươi mốt
+61~sáu mươi mốt
+71~bảy mươi mốt
+81~tám mươi mốt
+91~chín mươi mốt
+5500000~năm triệu năm trăm nghìn
+1000010~một triệu mười
+1000100~một triệu một trăm
+1000101~một triệu một trăm linh một
+1010001~một triệu mười nghìn linh một
+10000000000~mười tỷ
+150~một trăm năm mươi
\ No newline at end of file
diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py
index 0a888f84b..b745b1e09 100644
--- a/tests/nemo_text_processing/vi/test_cardinal.py
+++ b/tests/nemo_text_processing/vi/test_cardinal.py
@@ -12,32 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# pytest tests/nemo_text_processing/vi/test_cardinal.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
-from ..utils import CACHE_DIR, parse_test_case_file
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
 
-try:
-    from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
-
-    PYNINI_AVAILABLE = True
-except (ImportError, ModuleNotFoundError):
-    PYNINI_AVAILABLE = False
+from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file
 
 
 class TestCardinal:
-    inverse_normalizer = (
-        InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
-    )
-
+    inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
     @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_cardinal.txt'))
-    @pytest.mark.skipif(
-        not PYNINI_AVAILABLE,
-        reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh",
-    )
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_denorm(self, test_input, expected):
         pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
         assert pred == expected
+
+    normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True)
+
+    normalizer_with_audio = (
+        NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
+        if CACHE_DIR and RUN_AUDIO_BASED_TESTS
+        else None
+    )
+
+    @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False)
+        assert pred == expected, f"input: {test_input}"
+
+        if self.normalizer_with_audio:
+            pred_non_deterministic = self.normalizer_with_audio.normalize(
+                test_input,
+                n_tagged=30,
+                punct_post_process=False,
+            )
+            assert expected in pred_non_deterministic, f"input: {test_input}"
\ No newline at end of file
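Both the pytest file above and the shUnit2 script below consume the same `written~spoken` test-case format; a minimal parser sketch, assuming `parse_test_case_file` in `tests/nemo_text_processing/utils.py` behaves similarly:

```python
# Sketch of the written~spoken convention; assumes one test case per line.
def parse_case(line):
    written, spoken = line.rstrip("\n").split("~", 1)
    return written, spoken

assert parse_case("101~một trăm linh một") == ("101", "một trăm linh một")
```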
diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
index 751351cd4..684eb3b22 100644
--- a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
+++ b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh
@@ -1,7 +1,7 @@
 #! /bin/sh
 
 GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
-PROJECT_DIR=${2:-"/workspace/tests/en"}
+PROJECT_DIR=${2:-"/workspace/tests"}
 
 runtest () {
   input=$1
diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..d230b4642
--- /dev/null
+++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh
@@ -0,0 +1,77 @@
+
+#! /bin/sh
+
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+PROJECT_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    norm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1)
+
+    # trim white space
+    spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    norm_pred="$(echo -e "${norm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$written" "$spoken" "$norm_pred"
+  done < "$input"
+}
+
+testTNCardinal() {
+  input=$PROJECT_DIR/vi/data_text_normalization/test_cases_cardinal.txt
+  runtest $input
+}
+
+# testTNDate() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt
+#   runtest $input
+# }
+
+# testTNDecimal() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_decimal.txt
+#   runtest $input
+# }
+
+# testTNOrdinal() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_ordinal.txt
+#   runtest $input
+# }
+
+# testTNFraction() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_fraction.txt
+#   runtest $input
+# }
+
+# testTNTime() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_time.txt
+#   runtest $input
+# }
+
+# testTNMeasure() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_measure.txt
+#   runtest $input
+# }
+
+# testTNMoney() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_money.txt
+#   runtest $input
+# }
+
+# testTNTelephone() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_telephone.txt
+#   runtest $input
+# }
+
+# testTNElectronic() {
+#   input=$PROJECT_DIR/vi/data_text_normalization/test_cases_electronic.txt
+#   runtest $input
+# }
+
+# Load shUnit2
+. /workspace/shunit2/shunit2
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 6b82dfbec..bc19f428d 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -137,7 +137,7 @@ def parse_args():
 
 if __name__ == '__main__':
     args = parse_args()
-    if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
+    if args.language in ['pt', 'ru', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
         raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.')
     TNPostProcessingFst = None
     ITNPostProcessingFst = None
@@ -240,6 +240,10 @@ def parse_args():
         from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize import (
             VerbalizeFst as ITNVerbalizeFst,
         )
+        from nemo_text_processing.text_normalization.vi.taggers.tokenize_and_classify import (
+            ClassifyFst as TNClassifyFst,
+        )
+        from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
     elif args.language == 'zh':
         from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,

From 2df93bc76f2400496ed58e792fcbcfa6cac1e633 Mon Sep 17 00:00:00 2001
From: folivoramanh
Date: Thu, 12 Jun 2025 12:42:36 +0700
Subject: [PATCH 2/4] Add missing init file

Signed-off-by: folivoramanh
---
 .../text_normalization/vi/data/__init__.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 nemo_text_processing/text_normalization/vi/data/__init__.py

diff --git a/nemo_text_processing/text_normalization/vi/data/__init__.py b/nemo_text_processing/text_normalization/vi/data/__init__.py
new file mode 100644
index 000000000..6ebc808fa
--- /dev/null
+++ b/nemo_text_processing/text_normalization/vi/data/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
From 109d071d326e97bb47233d67b73762ce7744e87c Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Thu, 19 Jun 2025 00:00:34 +0700 Subject: [PATCH 3/4] Fix Cardinal and optimize logic Signed-off-by: folivoramanh --- Jenkinsfile | 7 +- .../vi/data/numbers/__init__.py | 13 + .../numbers/{units.tsv => magnitudes.tsv} | 0 .../text_normalization/vi/taggers/cardinal.py | 339 +++++------------- .../vi/taggers/punctuation.py | 25 +- .../vi/taggers/tokenize_and_classify.py | 1 - 6 files changed, 119 insertions(+), 266 deletions(-) rename nemo_text_processing/text_normalization/vi/data/numbers/{units.tsv => magnitudes.tsv} (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..2f9ca394d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -176,7 +176,7 @@ pipeline { } } - stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') { + stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') { when { anyOf { branch 'main' @@ -200,6 +200,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' } } + stage('L0: VI TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' + } + } stage('L0: HU TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py index e69de29bb..6ebc808fa 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
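The new VI TN Jenkins stage above is a CLI smoke test; the equivalent check from Python is sketched below. This assumes the vi grammars build locally; cache_dir is optional but speeds up repeated runs.

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Build (or restore from cache) the Vietnamese TN grammars, then normalize one string.
    normalizer = Normalizer(input_case='cased', lang='vi')
    print(normalizer.normalize("100", verbose=False))  # expected: "một trăm"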
diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/units.tsv b/nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv similarity index 100% rename from nemo_text_processing/text_normalization/vi/data/numbers/units.tsv rename to nemo_text_processing/text_normalization/vi/data/numbers/magnitudes.tsv diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index fdd3cae82..99fa76acd 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -19,264 +19,119 @@ from nemo_text_processing.text_normalization.vi.utils import get_abs_path -def load_data_map(filename): - """Load TSV data as pynini string map.""" - mappings = [] - with open(get_abs_path(f"data/numbers/{filename}"), 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - parts = line.split('\t') - if len(parts) >= 2: - mappings.append((parts[0], parts[1])) - return pynini.string_map(mappings) - - class CardinalFst(GraphFst): - """ - Simplified Vietnamese cardinal FST using recursive pattern building. - Reduced from 700+ lines to ~200 lines while maintaining full functionality. - """ - def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) - - # Load all basic data maps - zero = load_data_map("zero.tsv") - digit = load_data_map("digit.tsv") - teen = load_data_map("teen.tsv") - ties = load_data_map("ties.tsv") - # Load units as dict for easy access - units = {} - with open(get_abs_path("data/numbers/units.tsv"), 'r', encoding='utf-8') as f: - for line in f: - parts = line.strip().split('\t') - if len(parts) == 2: - units[parts[0]] = parts[1] - - # Load special digits (contextual variants) - special = {} - with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - for line in f: - parts = line.strip().split('\t') - if len(parts) >= 3: - special[parts[0]] = {'std': parts[1], 'alt': parts[2]} - - # Build core patterns - single_digit = digit + resources = { + 'zero': pynini.string_file(get_abs_path("data/numbers/zero.tsv")), + 'digit': pynini.string_file(get_abs_path("data/numbers/digit.tsv")), + 'teen': pynini.string_file(get_abs_path("data/numbers/teen.tsv")), + 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + } + self.zero, self.digit, self.teen, self.ties = resources.values() - # Special digits for specific contexts (X1, X4, X5 → mốt, tư, lăm) - special_1 = pynini.cross("1", special["1"]["alt"]) # mốt - special_4 = pynini.cross("4", special["4"]["alt"]) # tư - special_5 = pynini.cross("5", special["5"]["alt"]) # lăm + with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: + self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} - # Linh digits (for 0X patterns) - use standard forms - linh_digits = pynini.union( - pynini.cross("1", special["1"]["std"]), # một - pynini.cross("4", special["4"]["std"]), # bốn - pynini.cross("5", special["5"]["std"]), # năm - digit - ) + with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: + special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for line in f + if len(parts := line.strip().split('\t')) >= 3} - # Two digit patterns - two_digit = pynini.union( - teen, # 10-19 - ties + pynutil.delete("0"), # 20, 30, etc. 
- ties + insert_space + pynini.union( - special_1, special_4, special_5, # X1, X4, X5 cases - pynini.union("2", "3", "6", "7", "8", "9") @ digit # other digits - ) - ) - - # Build hundreds (3 digits: 100-999) - hundreds_base = pynini.union( - single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("00"), - single_digit + insert_space + pynutil.insert(units["hundred"]) + pynutil.delete("0") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["hundred"]) + insert_space + two_digit - ) - hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ hundreds_base - - # Build thousands (4-6 digits) with explicit digit constraints - # 4-digit thousands (1000-9999) - thousands_4d = pynini.union( - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) + self.special_digits = pynini.union(*[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]]) + self.linh_digits = pynini.union(*[pynini.cross(k, special[k]["std"]) for k in ["1", "4", "5"]], self.digit) - # 5-digit thousands (10000-99999) - thousands_5d = pynini.union( - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - two_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) + self.single_digit = self.digit - # 6-digit thousands (100000-999999) - thousands_6d = pynini.union( - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000"), - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base - ) - - thousands = pynini.union( - pynini.closure(NEMO_DIGIT, 6, 6) @ thousands_6d, - pynini.closure(NEMO_DIGIT, 5, 5) @ thousands_5d, - pynini.closure(NEMO_DIGIT, 4, 4) @ thousands_4d - ) - - # Build millions (7-9 digits) with explicit patterns to fix precedence - # 7-digit millions (1000000-9999999) - millions_7d = pynini.union( - # Exact millions: 1000000, 2000000, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - # Millions with linh: 1000001, 1000002, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Millions with tens: 1000010, 1000020, etc. - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - # Millions with hundreds: 1000100, 1000200, etc. 
- single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - # Millions with thousands: 5500000 -> năm triệu năm trăm nghìn - single_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d, - # Complex millions: X001YYY -> X triệu một nghìn YYY (critical fix for 1001001) - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Complex millions: X0YZWWW -> X triệu YZ nghìn WWW (critical fix for 1050003) - single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0") - + insert_space + two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # Full millions: X123YZW -> X triệu YZW nghìn/trăm/etc (1050003) - single_digit + insert_space + pynutil.insert(units["million"]) + insert_space - + pynini.closure(NEMO_DIGIT, 3, 3) @ ( - pynini.union( - # YZW000 patterns - invalid for 6 digits, skip - # YZ0ABC patterns - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # YZ0ABC patterns with tens - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("0") - + insert_space + two_digit, - # YYZABC patterns with hundreds - hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # 0YYZABC patterns (hundreds only) - pynutil.delete("0") + hundreds_base + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # 00YABC patterns (tens only) - pynutil.delete("00") + hundreds_base, - # Y00ABC patterns (single thousand) - single_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("00") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - # YZ00AB patterns (tens of thousands) - two_digit + insert_space + pynutil.insert(units["thousand"]) + pynutil.delete("000") - ) + self.two_digit = pynini.union( + self.teen, + self.ties + pynutil.delete("0"), + self.ties + insert_space + pynini.union( + self.special_digits, + pynini.union("2", "3", "6", "7", "8", "9") @ self.digit ) ) - # 8-digit millions (10000000-99999999) - millions_8d = pynini.union( - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - two_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, - two_digit + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d + self.hundreds_pattern = pynini.union( + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("00"), + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + 
pynutil.delete("0") + + insert_space + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + insert_space + self.two_digit ) - # 9-digit millions (100000000-999999999) - millions_9d = pynini.union( - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000000"), - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("0000") - + insert_space + two_digit, - hundreds_base + insert_space + pynutil.insert(units["million"]) + pynutil.delete("000") - + insert_space + hundreds_base, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_4d, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_5d, - hundreds_base + insert_space + pynutil.insert(units["million"]) + insert_space + thousands_6d - ) - - millions = pynini.union( - pynini.closure(NEMO_DIGIT, 9, 9) @ millions_9d, - pynini.closure(NEMO_DIGIT, 8, 8) @ millions_8d, - pynini.closure(NEMO_DIGIT, 7, 7) @ millions_7d - ) - - # Build billions (10-12 digits) with explicit patterns - # 10-digit billions (1000000000-9999999999) - billions_10d = pynini.union( - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00000000") - + insert_space + pynutil.insert(units["linh"]) + insert_space + linh_digits, - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("0000000") - + insert_space + two_digit, - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000") - + insert_space + hundreds_base, - # Complex billions: 1001001101 -> một tỷ một triệu một nghìn một trăm linh một - single_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["million"]) + pynutil.delete("00") - + insert_space + single_digit + insert_space + pynutil.insert(units["thousand"]) + insert_space + hundreds_base, - # Full billions with millions - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - single_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) + self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - # 11-digit billions (10000000000-99999999999) - billions_11d = pynini.union( - two_digit + insert_space + pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - two_digit + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) + self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) + self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) + self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) - # 12-digit billions (100000000000-999999999999) - billions_12d = pynini.union( - hundreds_base + insert_space + 
pynutil.insert(units["billion"]) + pynutil.delete("000000000"), - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_7d, - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_8d, - hundreds_base + insert_space + pynutil.insert(units["billion"]) + insert_space + millions_9d - ) - - billions = pynini.union( - pynini.closure(NEMO_DIGIT, 12, 12) @ billions_12d, - pynini.closure(NEMO_DIGIT, 11, 11) @ billions_11d, - pynini.closure(NEMO_DIGIT, 10, 10) @ billions_10d - ) - - # Combine all patterns with proper precedence (longest first) self.graph = pynini.union( - billions, # 10-12 digits - millions, # 7-9 digits - thousands, # 4-6 digits - hundreds, # 3 digits - two_digit, # 2 digits - single_digit, # 1 digit - zero # 0 + self.billion, self.million, self.thousand, self.hundreds, + self.two_digit, self.single_digit, self.zero ).optimize() - - # For decimal usage - self.single_digits_graph = single_digit | zero + + self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - - # Build final FST with negative handling - optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) - final_graph = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") - self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file + + self.fst = self.add_tokens( + pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + ).optimize() + + def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): + magnitude_word = self.magnitudes[name] + + patterns = [] + for digits in range(min_digits, max_digits + 1): + leading_digits = digits - zero_count + leading_fst = {1: self.single_digit, 2: self.two_digit, 3: self.hundreds_pattern}.get( + leading_digits, self.hundreds_pattern) + + prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) + + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] + + if prev_pattern: + digit_patterns.append(prefix + insert_space + prev_pattern) + + trailing_patterns = [] + for trailing_zeros in range(zero_count): + remaining_digits = zero_count - trailing_zeros + if remaining_digits == 1: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + ) + elif remaining_digits == 2: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.two_digit + ) + elif remaining_digits == 3: + trailing_patterns.append( + prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.hundreds_pattern + ) + digit_patterns.extend(trailing_patterns) + + if name == "million" and digits == 7: + digit_patterns.extend([ + prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, + + prefix + pynutil.delete("0") + insert_space + self.two_digit + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + + pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + ]) + elif name == "billion" and digits == 10: + digit_patterns.append( + prefix + pynutil.delete("00") + insert_space + 
self.single_digit + insert_space + + pynutil.insert(self.magnitudes["million"]) + pynutil.delete("00") + insert_space + + self.single_digit + insert_space + pynutil.insert(self.magnitudes["thousand"]) + + insert_space + self.hundreds_pattern + ) + + patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) + + return pynini.union(*patterns) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index f0d20918e..044c6494c 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst class PunctuationFst(GraphFst): @@ -28,27 +28,8 @@ def __init__(self, deterministic: bool = True): # Common punctuation marks # Use escape() for brackets since they are special regex chars - punct = ( - pynini.accep(".") - | pynini.accep(",") - | pynini.accep("?") - | pynini.accep("!") - | pynini.accep(":") - | pynini.accep(";") - | pynini.accep("-") - | pynini.accep("–") - | pynini.accep("—") - | pynini.accep("(") - | pynini.accep(")") - | pynini.accep(pynini.escape("[")) - | pynini.accep(pynini.escape("]")) - | pynini.accep(pynini.escape("{")) - | pynini.accep(pynini.escape("}")) - | pynini.accep('"') - | pynini.accep("'") - | pynini.accep("...") - | pynini.accep("…") - ) + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~–—――…»«„“›‹‚‘’⟨⟩" + punct = pynini.union(*s) # Create the punctuation transduction graph = pynutil.insert('name: "') + punct + pynutil.insert('"') diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 52038b14d..004e51c9d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_WHITE_SPACE, GraphFst, delete_extra_space, delete_space, From d751b36114ef08c9ef6a30b23a81e2cf54b58d30 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:13:25 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/__init__.py | 2 +- .../text_normalization/vi/taggers/__init__.py | 2 +- .../text_normalization/vi/taggers/cardinal.py | 152 +++++++++++------- .../vi/taggers/punctuation.py | 2 +- .../vi/taggers/tokenize_and_classify.py | 13 +- .../vi/taggers/whitelist.py | 4 +- .../text_normalization/vi/taggers/word.py | 2 +- .../vi/verbalizers/__init__.py | 2 +- .../vi/verbalizers/cardinal.py | 2 +- .../vi/verbalizers/verbalize.py | 4 +- .../vi/verbalizers/verbalize_final.py | 4 +- .../vi/verbalizers/whitelist.py | 2 +- .../text_normalization/vi/verbalizers/word.py | 2 +- .../nemo_text_processing/vi/test_cardinal.py | 12 +- 14 files changed, 127 insertions(+), 78 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py index 602b8a347..bc443be41 100644 --- 
a/nemo_text_processing/text_normalization/vi/__init__.py +++ b/nemo_text_processing/text_normalization/vi/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py index 602b8a347..bc443be41 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 99fa76acd..fa0f04fad 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -22,77 +22,91 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) - + resources = { 'zero': pynini.string_file(get_abs_path("data/numbers/zero.tsv")), 'digit': pynini.string_file(get_abs_path("data/numbers/digit.tsv")), 'teen': pynini.string_file(get_abs_path("data/numbers/teen.tsv")), - 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + 'ties': pynini.string_file(get_abs_path("data/numbers/ties.tsv")), } self.zero, self.digit, self.teen, self.ties = resources.values() - + with open(get_abs_path("data/numbers/magnitudes.tsv"), 'r', encoding='utf-8') as f: self.magnitudes = {parts[0]: parts[1] for line in f if len(parts := line.strip().split('\t')) == 2} - + with open(get_abs_path("data/numbers/digit_special.tsv"), 'r', encoding='utf-8') as f: - special = {parts[0]: {'std': parts[1], 'alt': parts[2]} for line in f - if len(parts := line.strip().split('\t')) >= 3} - - self.special_digits = pynini.union(*[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]]) + special = { + parts[0]: {'std': parts[1], 'alt': parts[2]} + for line in f + if len(parts := line.strip().split('\t')) >= 3 + } + + self.special_digits = pynini.union( + *[pynini.cross(k, v["alt"]) for k, v in special.items() if k in ["1", "4", "5"]] + ) self.linh_digits = pynini.union(*[pynini.cross(k, special[k]["std"]) for k in ["1", "4", "5"]], self.digit) - + self.single_digit = self.digit - + self.two_digit = pynini.union( self.teen, self.ties + pynutil.delete("0"), - self.ties + insert_space + pynini.union( - self.special_digits, - pynini.union("2", "3", "6", "7", "8", "9") @ self.digit - ) + self.ties + + insert_space + + pynini.union(self.special_digits, pynini.union("2", "3", "6", "7", "8", "9") @ self.digit), ) - + self.hundreds_pattern = pynini.union( self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("00"), - - self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + pynutil.delete("0") - + insert_space + 
pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, - - self.single_digit + insert_space + pynutil.insert(self.magnitudes["hundred"]) + insert_space + self.two_digit + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["hundred"]) + + pynutil.delete("0") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["hundred"]) + + insert_space + + self.two_digit, ) - + self.hundreds = pynini.closure(NEMO_DIGIT, 3, 3) @ self.hundreds_pattern - + self.thousand = self._build_magnitude_pattern("thousand", 4, 6, 3) self.million = self._build_magnitude_pattern("million", 7, 9, 6, self.thousand) self.billion = self._build_magnitude_pattern("billion", 10, 12, 9, self.million) - + self.graph = pynini.union( - self.billion, self.million, self.thousand, self.hundreds, - self.two_digit, self.single_digit, self.zero + self.billion, self.million, self.thousand, self.hundreds, self.two_digit, self.single_digit, self.zero ).optimize() - + self.single_digits_graph = self.single_digit | self.zero self.graph_with_and = self.graph - + self.fst = self.add_tokens( - pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + - pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + + pynutil.insert("integer: \"") + + self.graph + + pynutil.insert("\"") ).optimize() - + def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, prev_pattern=None): magnitude_word = self.magnitudes[name] - + patterns = [] for digits in range(min_digits, max_digits + 1): leading_digits = digits - zero_count leading_fst = {1: self.single_digit, 2: self.two_digit, 3: self.hundreds_pattern}.get( - leading_digits, self.hundreds_pattern) - + leading_digits, self.hundreds_pattern + ) + prefix = leading_fst + insert_space + pynutil.insert(magnitude_word) - + digit_patterns = [prefix + pynutil.delete("0" * zero_count)] - + if prev_pattern: digit_patterns.append(prefix + insert_space + prev_pattern) @@ -101,8 +115,12 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre remaining_digits = zero_count - trailing_zeros if remaining_digits == 1: trailing_patterns.append( - prefix + pynutil.delete("0" * trailing_zeros) + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits + prefix + + pynutil.delete("0" * trailing_zeros) + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits ) elif remaining_digits == 2: trailing_patterns.append( @@ -113,25 +131,51 @@ def _build_magnitude_pattern(self, name, min_digits, max_digits, zero_count, pre prefix + pynutil.delete("0" * trailing_zeros) + insert_space + self.hundreds_pattern ) digit_patterns.extend(trailing_patterns) - + if name == "million" and digits == 7: - digit_patterns.extend([ - prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + - pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits, - - prefix + pynutil.delete("0") + insert_space + self.two_digit + insert_space + - pynutil.insert(self.magnitudes["thousand"]) + pynutil.delete("00") + insert_space + - pynutil.insert(self.magnitudes["linh"]) + insert_space + self.linh_digits - ]) + digit_patterns.extend( + [ + prefix 
+ + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + pynutil.delete("00") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + prefix + + pynutil.delete("0") + + insert_space + + self.two_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + pynutil.delete("00") + + insert_space + + pynutil.insert(self.magnitudes["linh"]) + + insert_space + + self.linh_digits, + ] + ) elif name == "billion" and digits == 10: digit_patterns.append( - prefix + pynutil.delete("00") + insert_space + self.single_digit + insert_space + - pynutil.insert(self.magnitudes["million"]) + pynutil.delete("00") + insert_space + - self.single_digit + insert_space + pynutil.insert(self.magnitudes["thousand"]) + - insert_space + self.hundreds_pattern + prefix + + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["million"]) + + pynutil.delete("00") + + insert_space + + self.single_digit + + insert_space + + pynutil.insert(self.magnitudes["thousand"]) + + insert_space + + self.hundreds_pattern ) - + patterns.append(pynini.closure(NEMO_DIGIT, digits, digits) @ pynini.union(*digit_patterns)) - - return pynini.union(*patterns) \ No newline at end of file + + return pynini.union(*patterns) diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index 044c6494c..1e08cb02d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -35,4 +35,4 @@ def __init__(self, deterministic: bool = True): graph = pynutil.insert('name: "') + punct + pynutil.insert('"') final_graph = pynutil.insert("punctuation { ") + graph + pynutil.insert(" }") - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 004e51c9d..7c46c786a 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -45,7 +45,10 @@ def __init__( far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far",) + far_file = os.path.join( + cache_dir, + f"vi_tn_{deterministic}_deterministic_{input_case}_tokenize.far", + ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] logger.info(f"ClassifyFst.fst was restored from {far_file}.") @@ -72,9 +75,9 @@ def __init__( logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes") classify = ( - pynutil.add_weight(whitelist_graph, 0.8) - | pynutil.add_weight(cardinal_graph, 0.9) - | pynutil.add_weight(word_graph, 100) + pynutil.add_weight(whitelist_graph, 0.8) + | pynutil.add_weight(cardinal_graph, 0.9) + | pynutil.add_weight(word_graph, 100) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") @@ -88,4 +91,4 @@ def __init__( self.fst = graph.optimize() 
if far_file: - generator_main(far_file, {"tokenize_and_classify": self.fst}) \ No newline at end of file + generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py index 5ffd7732e..aed5e356a 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py @@ -15,8 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class WhiteListFst(GraphFst): @@ -67,4 +67,4 @@ def _get_whitelist_graph(input_case, file): self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize() # Add tokens wrapper - self.fst = self.add_tokens(self.fst) \ No newline at end of file + self.fst = self.add_tokens(self.fst) diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py index c3e6d408e..f0be213c7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/text_normalization/vi/taggers/word.py @@ -31,4 +31,4 @@ class WordFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="classify", deterministic=deterministic) word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") - self.fst = word.optimize() \ No newline at end of file + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py index 602b8a347..bc443be41 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
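A note on the classify weights (whitelist 0.8, cardinal 0.9, word 100, punct 2.1) in the tokenize_and_classify.py hunk above: shortest-path composition keeps the cheapest path, so the heavily weighted word fallback only fires when no specific tagger accepts the input. A toy pynini illustration of that arbitration (not the project grammar):

    import pynini
    from pynini.lib import pynutil

    # The cardinal path (weight 0.9) beats the generic fallback (weight 100) on "100".
    cardinal = pynutil.add_weight(pynini.cross("100", "một trăm"), 0.9)
    fallback = pynutil.add_weight(pynini.closure(pynini.union("1", "0", "x"), 1), 100.0)
    classify = (cardinal | fallback).optimize()

    print(pynini.shortestpath(pynini.compose("100", classify)).string())  # -> một trăm
    print(pynini.shortestpath(pynini.compose("x", classify)).string())    # -> x (fallback)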
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py index 5ca695673..530c3dfce 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py @@ -52,4 +52,4 @@ def __init__(self, deterministic: bool = True): # Delete the token structure and create final FST delete_tokens = self.delete_tokens(self.numbers) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index 772b2b5f5..fff63933e 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -25,7 +25,7 @@ def __init__(self, deterministic: bool = True): # Initialize verbalizers cardinal = CardinalFst(deterministic=deterministic) cardinal_graph = cardinal.fst - + whitelist = WhiteListFst(deterministic=deterministic) whitelist_graph = whitelist.fst @@ -35,4 +35,4 @@ def __init__(self, deterministic: bool = True): # Combine all verbalizers graph = cardinal_graph | whitelist_graph | word_graph - self.fst = graph \ No newline at end of file + self.fst = graph diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py index a049a5796..cd9ec39eb 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py @@ -31,7 +31,7 @@ class VerbalizeFinalFst(GraphFst): """ Finite state transducer that verbalizes an entire Vietnamese sentence, e.g. 
- tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" } + tokens { name: "xin" } tokens { cardinal { integer: "một trăm" } } tokens { name: "chào" } -> xin một trăm chào Args: @@ -69,4 +69,4 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = graph.optimize() if far_file: - generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py index 0b77ee498..6e0699827 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py @@ -39,4 +39,4 @@ def __init__(self, deterministic: bool = True): ) graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py index 3ad9a1a82..f9547acba 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py @@ -34,4 +34,4 @@ def __init__(self, deterministic: bool = True): char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) - self.fst = graph.optimize() \ No newline at end of file + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py index b745b1e09..636932aed 100644 --- a/tests/nemo_text_processing/vi/test_cardinal.py +++ b/tests/nemo_text_processing/vi/test_cardinal.py @@ -25,8 +25,8 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) - + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -34,8 +34,10 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) - + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if CACHE_DIR and RUN_AUDIO_BASED_TESTS @@ -55,4 +57,4 @@ def test_norm(self, test_input, expected): n_tagged=30, punct_post_process=False, ) - assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file + assert expected in pred_non_deterministic, f"input: {test_input}"
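For completeness, the non-deterministic branch at the end of test_cardinal.py can be exercised directly as sketched below; this assumes the vi grammars are available (the first build is slow without a cache_dir).

    from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

    # Returns up to n_tagged candidate verbalizations rather than a single output.
    nwa = NormalizerWithAudio(input_case='cased', lang='vi')
    candidates = nwa.normalize("21", n_tagged=30, punct_post_process=False)
    print(candidates)  # "hai mươi mốt" should appear among the candidates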