diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2886d56e..fca523e58 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: check-case-conflict @@ -30,22 +30,22 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/flake8 - rev: 7.2.0 + rev: 7.3.0 hooks: - id: flake8 args: - --select=W605 - repo: https://github.com/PyCQA/isort - rev: 6.0.1 + rev: 6.1.0 hooks: - id: isort name: Format imports args: [ --multi-line=3, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=119, -rc, -ws ] exclude: docs/ - - repo: https://github.com/psf/black - rev: 25.1.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 25.9.0 hooks: - id: black name: Format code diff --git a/Jenkinsfile b/Jenkinsfile index f29de1a90..fd74017aa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,7 +2,7 @@ pipeline { agent { docker { image 'tnitn_ci_py310:24.07' - args '-v /mnt/jenkins/jenkinsci:/home/jenkins -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' + args '-v /mnt/jenkins/jenkinsci/TestData:/home/jenkins/TestData -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' } } options { @@ -10,7 +10,6 @@ pipeline { disableConcurrentBuilds(abortPrevious: true) } environment { - AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0' EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0' @@ -24,10 +23,11 @@ pipeline { SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0' IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0' + HE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-24-25-0' HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' 
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-25-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-31-25-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -253,7 +253,24 @@ pipeline { } } } - + stage('L0: Create He TN/ITN Grammars & MR') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: HE ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' + } + } + } + } stage('L0: Create HY TN/ITN Grammars & MR') { when { anyOf { @@ -413,6 +430,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' } } + stage('L1: Run all HE TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/he/ -m "not pleasefixme" --cpu --tn_cache_dir ${HE_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/he/__init__.py b/nemo_text_processing/inverse_text_normalization/he/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv new file mode 100644 index 000000000..5626b7100 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv @@ -0,0 +1,2 @@ +חצי +רבע diff --git a/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv new file mode 100644 index 000000000..fbd061bc5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv @@ -0,0 +1,45 @@ +°F פרנהייט +°C צלסיוס +° מעלות +°F מעלות פרנהייט +°C מעלות צלסיוס +K קלווין +% אחוז +% אחוזים +Hz הרץ +kW קילוואט +kW קילו ואט +kW קילו וואט +kWh קילו ואט לשעה +kWh קילוואט לשעה +Wh ואט לשעה +W ואט +ghz ג׳יגה הרץ +ghz גיגה הרץ +khz קילו הרץ +mhz מגה הרץ +v וולט +nm ננומטר +mA מילי אמפר +tW טרה ואט +mv מילי וולט +mW מגה ואט +μm מיקרומטר +" אינץ׳ +cc סי סי +ω אוהם +db דציבל +db דציבלים +kb קילו ביט +mb מגה ביט +gb ג׳יגה ביט +gb גיגה ביט +tb טרה ביט +pb פטה ביט +mb מגה בייט +kb קילו בייט +gb ג׳יגה בייט +gb גיגה בייט +tb טרה בייט +pb פטה בייט +A אמפר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv new file mode 100644 index 000000000..05415cc3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv @@ -0,0 +1,13 @@ +ינואר 
+פברואר +מרץ +מרס +אפריל +מאי +יוני +יולי +אוגוסט +ספטמבר +אוקטובר +נובמבר +דצמבר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv new file mode 100644 index 000000000..651118ca1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv @@ -0,0 +1,12 @@ +ינואר 1 +פברואר 2 +מרץ 3 +אפריל 4 +מאי 5 +יוני 6 +יולי 7 +אוגוסט 8 +ספטמבר 9 +אוקטובר 10 +נובמבר 11 +דצמבר 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv new file mode 100644 index 000000000..e75a452d8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv @@ -0,0 +1,12 @@ +ראשון 1 +שני 2 +שלישי 3 +רביעי 4 +חמישי 5 +שישי 6 +שביעי 7 +שמיני 8 +תשיעי 9 +עשירי 10 +אחת עשרה 11 +שתיים עשרה 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv new file mode 100644 index 000000000..d88316454 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv @@ -0,0 +1,6 @@ +חצי 5 +רבע 25 +שלושת רבעי 75 +עשירית 1 +שתי עשיריות 2 +חמישית 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv new file mode 100644 index 000000000..68c02dd42 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv @@ -0,0 +1,20 @@ +אחד 1 +שניים 2 +שני 2 +שלושה 3 +ארבעה 4 +חמישה 5 +שישה 6 +שבעה 7 +שמונה 8 +תשעה 9 +אחת 1 +שתיים 2 +שתים 2 +שתי 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +תשע 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv new file mode 100644 index 000000000..88e54ab57 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv @@ -0,0 +1,2 @@ +מאה 1 +מאתיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv new file mode 100644 index 000000000..1443e5def --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv @@ -0,0 +1 @@ +מיליון 1 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv new 
file mode 100644 index 000000000..26f1a5a4c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv @@ -0,0 +1,21 @@ +עשר 10 +אחד עשר 11 +שניים עשר 12 +שלושה עשר 13 +ארבעה עשר 14 +חמישה עשר 15 +שישה עשר 16 +שבעה עשר 17 +שמונה עשר 18 +תשעה עשר 19 +עשרה 10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 +שלוש עשרה 13 +ארבע עשרה 14 +חמש עשרה 15 +שש עשרה 16 +שבע עשרה 17 +שמונה עשרה 18 +תשע עשרה 19 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv new file mode 100644 index 000000000..534789509 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv @@ -0,0 +1,8 @@ +שלושת 3 +ארבעת 4 +חמשת 5 +ששת 6 +שבעת 7 +שמונת 8 +תשעת 9 +עשרת 10 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv new file mode 100644 index 000000000..dd0c71c0d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv @@ -0,0 +1,2 @@ +אלף 1 +אלפיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv new file mode 100644 index 000000000..b6dd59ca3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv @@ -0,0 +1,8 @@ +עשרים 2 +שלושים 3 +ארבעים 4 +חמישים 5 +שישים 6 +שבעים 7 +שמונים 8 +תשעים 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv new file mode 100644 index 000000000..6a2cb1307 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv @@ -0,0 +1,15 @@ +אחד 1 +אחת 1 +שתיים 2 
+שתים 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +שמונה 8 +תשע 9 +עשר 10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv new file mode 100644 index 000000000..a0b033c5d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv @@ -0,0 +1 @@ +אפס 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv new file mode 100644 index 000000000..036e1433a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv @@ -0,0 +1,10 @@ +ראשון אחד +שני שניים +שלישי שלושה +רביעי ארבעה +חמישי חמישה +שישי שישה +שביעי שבעה +שמיני שמונה +תשיעי תשעה +עשירי עשרה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv new file mode 100644 index 000000000..988d6aedf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv @@ -0,0 +1,17 @@ +וה +שה +ב +כ +ל +מ +ה +ו +וב +ול +ש +מה +ומ +שכ +שב +בכ +לכ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv new file mode 100644 index 000000000..a97b03412 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv @@ -0,0 +1,17 @@ +ק״מ קילומטר +ק״מ קילומטרים +מ׳ מטר +מ׳ מטרים +ס״מ סנטימטר +ס״מ סנטימטרים +מ״מ מילימטר +מ״מ מילימטרים +מ״ג מיליגרם +מ״ג מיליגרמים +מ״ל מיליליטר +ק״ג קילוגרם +ק״ג קילוגרמים +קמ״ש קילומטר לשעה +קמ״ש קילומטרים לשעה +ג׳ גרם +ג׳ גרמים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv new file mode 100644 index 000000000..a4f9d2d46 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv @@ -0,0 +1,2 @@ +בבוקר +לפנות בוקר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv new file mode 100644 index 000000000..583470a05 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv @@ -0,0 +1,2 @@ +בערב +לפנות ערב \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv new file mode 100644 index 000000000..4fd47d1e2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv @@ -0,0 +1,7 @@ +5 17 +6 18 +7 19 +8 20 +9 21 +10 22 +11 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv new file mode 100644 index 000000000..656d161b2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv @@ -0,0 +1,9 @@ +8 20 +9 21 +10 22 +11 23 +12 0 +1 1 +2 2 +3 3 +4 4 \ No newline at end of 
file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv new file mode 100644 index 000000000..8d0de9024 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv @@ -0,0 +1,7 @@ +12 12 +1 13 +2 14 +3 15 +4 16 +5 17 +6 18 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv new file mode 100644 index 000000000..5b86a39eb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv @@ -0,0 +1 @@ +חצות 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv new file mode 100644 index 000000000..38858859c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv @@ -0,0 +1,58 @@ +02 58 +03 57 +04 56 +05 55 +06 54 +07 53 +08 52 +09 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 09 +52 08 +53 07 +54 06 +55 05 +56 04 +57 03 +58 02 +59 01 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv new file mode 100644 index 000000000..8f62ae4de --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv @@ -0,0 +1,6 @@ +רבע 45 +עשרה 50 +חמישה 55 +עשרים 40 +עשרים וחמישה 35 +דקה 59 \ No newline at end of file diff --git 
a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv new file mode 100644 index 000000000..efa2207c3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv @@ -0,0 +1,8 @@ +שלושת רבעי 45 +חצי 30 +רבע 15 +עשרים 20 +עשרה 10 +חמישה 05 +דקה 01 +שתי 02 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv new file mode 100644 index 000000000..464aa81c0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv @@ -0,0 +1 @@ +בלילה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv new file mode 100644 index 000000000..963d81053 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv @@ -0,0 +1,3 @@ +בצהריים +אחרי הצהריים +אחר הצהריים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv new file mode 100644 index 000000000..b5799a0b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv @@ -0,0 +1,8 @@ +בבוקר +לפנות בוקר +לפנות ערב +בערב +בצהריים +בלילה +אחרי הצהריים +אחר הצהריים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv new file mode 100644 index 000000000..5689943fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv @@ -0,0 +1,13 @@ +אחת 12 +שתיים 1 +שלוש 2 +ארבע 3 +חמש 4 +שש 5 +שבע 
6 +שמונה 7 +תשע 8 +עשר 9 +אחת עשרה 10 +שתיים עשרה 11 +חצות 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv new file mode 100644 index 000000000..9844685c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv @@ -0,0 +1,20 @@ +אח״כ אחר כך +וכו׳ וכולי +בריה״מ ברית המועצות +ארה״ב ארצות הברית +עו״ד עורך דין +עו״ד עורכת דין +עו״ד עורכי דין +עו״ד עורכות דין +רו״ח רואה חשבון +רו״ח רואת חשבון +רו״ח רואי חשבון +רו״ח רואות חשבון +לפנה״ס לפני הספירה +ד״ר דוקטור +פרופ׳ פרופסור +אמא אימא +כל כול +מאיתנו מאתנו +ישארו יישארו +יתכן ייתכן \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py new file mode 100644 index 000000000..072da0381 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -0,0 +1,119 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from pathlib import Path + +import pynini +from pynini import Far +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, delete_space +from nemo_text_processing.text_normalization.en.utils import load_labels + +NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize() +delete_and = pynutil.delete("ו") +delete_optional_and = delete_and.ques + +#################### +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +MINUS = pynini.union("מינוס").optimize() + + +def string_map_cased(input_file: str): + labels = load_labels(input_file) + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +def apply_fst(text, fst): + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. + """ + try: + print(pynini.shortestpath(text @ fst).string()) + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") + + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py new file mode 100644 index 000000000..aaf30b32c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + NEMO_ALPHA_HE, + GraphFst, + delete_and, + delete_optional_and, +) +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.en.utils import load_labels + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals in Hebrew + e.g. מינוס עשרים ושלוש ("minus twenty three" in Hebrew)-> cardinal { negative: "-" integer: "23" } } + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + # digits + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + + # teens + graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + graph_ties += pynini.union( + delete_space + delete_optional_and + graph_digit, + pynutil.insert("0", weight=0.001), + ) + graph_two_digit = pynini.union(graph_teen, graph_ties) + + self.graph_two_digit = pynini.union(graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001)) + + # hundreds + hundred_exception = pynini.string_file(get_abs_path("data/numbers/hundreds_exception.tsv")) + delete_hundred = pynutil.delete("מאות") + graph_hundred = delete_optional_and + pynini.union( + hundred_exception, + graph_digit + delete_space + delete_hundred, + pynutil.insert("0", weight=0.001), + ) + graph_hundred += delete_space + graph_hundred += pynini.union( + delete_optional_and + graph_two_digit, + pynutil.insert("0") + delete_space + delete_and + graph_digit, + pynutil.insert("00", weight=0.001), + ) + 
graph_hundred = pynini.union( + graph_hundred, + pynutil.insert("0") + graph_two_digit, + pynutil.insert("00") + graph_digit, + ) + self.graph_hundred = graph_hundred @ ( + pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) + ) + + # thousands + thousand_exception = pynini.string_file(get_abs_path("data/numbers/thousands_exception.tsv")) + thousand_digit = pynini.string_file(get_abs_path("data/numbers/thousands.tsv")) + delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001) + + large_number_prefix = pynini.union( + graph_hundred, + pynutil.insert("0") + graph_two_digit, + pynutil.insert("00") + thousand_digit, + ) + many_thousands = large_number_prefix + delete_space + delete_thousand + graph_thousands = delete_optional_and + pynini.union( + (pynutil.insert("00") + thousand_exception), + many_thousands, + pynutil.insert("000", weight=0.001), + ) + + self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) + self.graph_thousands @= pynini.union( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", + ) + + # millions + million_exceptions = pynini.string_file(get_abs_path("data/numbers/millions_exception.tsv")) + million_exceptions = pynutil.insert("00") + million_exceptions + delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001) + many_millions = large_number_prefix + delete_space + delete_millions + graph_millions = pynini.union(many_millions, million_exceptions, pynutil.insert("000", weight=0.001)) + + graph = pynini.union( + graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, + graph_zero, + ) + graph = graph @ pynini.union( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", + ) + + labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) + labels_exception = list(set([x[0] for x in 
labels_exception] + ["אפס", "עשר", "עשרה"])) + labels_exception += ["ו" + label for label in labels_exception] + graph_exception = pynini.union(*labels_exception).optimize() + graph = ((NEMO_ALPHA_HE + NEMO_SIGMA) @ graph).optimize() + + self.graph_no_exception = graph + + ### Token insertion + minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE + optional_minus_graph = pynini.closure(minus_graph, 0, 1) + + optional_prefix_graph = pynini.closure( + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + 0, + 1, + ) + + graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + + cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv")) + cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours])) + viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize() + self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph + + small_number_with_minus = ( + insert_space + minus_graph + pynutil.insert('integer: "') + self.graph_no_exception + pynutil.insert('"') + ) + + big_number_with_optional_minus = ( + optional_minus_graph + pynutil.insert('integer: "') + graph_wo_small_digits + pynutil.insert('"') + ) + + graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus) + + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py new file mode 100644 index 000000000..cf9cacbd5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -0,0 +1,106 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, insert_space + + +def _get_year_graph(graph_two_digits, graph_thousands): + """ + Transducer for year, e.g. twenty twenty -> 2020 + """ + year_graph = pynini.union( + (graph_two_digits + delete_space + graph_two_digits), + graph_thousands, # 20 19, 40 12, 20 20 + ) # 2012 - assuming no limit on the year + + year_graph.optimize() + return year_graph + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date in Hebrew, + e.g. אחד במאי אלף תשע מאות שמונים ושלוש -> date { day: "1" morphosyntactic_features: "ב" month: "5" year: "1983" } + e.g. מרץ אלף תשע מאות שמונים ותשע -> date { month: "מרץ" year: "1989" } + e.g. 
בינואר עשרים עשרים -> date { morphosyntactic_features: "ב" month: "ינואר" year: "2020" } + + Args: + cardinal: CardinalFst + ordinal: OrdinalFst + """ + + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + super().__init__(name="date", kind="classify") + + ordinal_graph = ordinal.graph + two_digits_graph = cardinal.graph_two_digit + + day_graph = pynutil.add_weight(two_digits_graph | ordinal_graph, -0.7) + day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"') + + month_names = pynini.string_file(get_abs_path("data/months.tsv")) + month_names_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"') + + month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv")) + month_name2number_graph = pynutil.insert('month: "') + month_name2number + pynutil.insert('"') + + month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) + month_number2number_graph = pynutil.insert('month: "') + month_number2number + pynutil.insert('"') + + all_month_graph = month_name2number_graph | month_number2number_graph + + year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands) + graph_year = delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"') + + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + delete_prefix = pynutil.delete(prefix_graph) + + graph_prefix = pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + year_prefix_graph = ( + pynutil.insert('morphosyntactic_features: "') + + pynini.closure(prefix_graph, 0, 1) + + pynini.union("שנה", "שנת") + + pynutil.insert('"') + ) + + graph_dm = ( + pynini.closure(graph_prefix + insert_space, 0, 1) + + day_graph + + insert_space + + delete_space + + pynini.closure(delete_prefix + insert_space, 0, 1) + + month_name2number_graph + ) + + graph_dmy = ( + pynini.closure(graph_prefix + insert_space, 0, 1) + + day_graph + + insert_space + + delete_space + + 
pynini.closure(delete_prefix + insert_space, 0, 1) + + all_month_graph + + graph_year + ) + + graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year + graph_y_only = year_prefix_graph + graph_year + + final_graph = graph_dm | graph_dmy | graph_my | graph_y_only + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py new file mode 100644 index 000000000..ecefb306a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import MINUS, GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": + """ + Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew, + + Args: + decimal: decimal FST + cardinal_up_to_hundred: cardinal FST + """ + numbers = cardinal_up_to_hundred @ ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + + suffix_labels = ["מיליון", "מיליארד"] + suffix = pynini.union(*suffix_labels).optimize() + + res = ( + pynutil.insert('integer_part: "') + + numbers + + pynutil.insert('"') + + delete_extra_space + + pynutil.insert('quantity: "') + + suffix + + pynutil.insert('"') + ) + res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + (suffix | "אלף") + pynutil.insert('"') + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal in Hebrew + e.g. עשרים ושלוש וחצי -> decimal { integer_part: "23" fractional_part: "5" } + e.g. אחד נקודה שלוש -> decimal { integer_part: "1" fractional_part: "3" } + e.g. ארבע נקודה חמש מיליון -> decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" } + e.g. מינוס ארבע מאות נקודה שלוש שתיים שלוש -> decimal { negative: "true" integer_part: "400" fractional_part: "323" } + e.g. 
אפס נקודה שלושים ושלוש -> decimal { integer_part: "0" fractional_part: "33" } + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="decimal", kind="classify") + + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + optional_prefix_graph = pynini.closure( + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + 0, + 1, + ) + + # all cardinals + cardinal_graph = cardinal.graph_no_exception + + # all fractions + fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv")) + fractions_graph = delete_zero_or_one_space + delete_and + fractions + fractions_graph = pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') + + # identify decimals that can be understood as time, don't convert them to avoid ambiguity + viable_minutes_exception = pynini.string_file(get_abs_path("data/decimals/minutes_exception.tsv")) + fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions + fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes + fractions_wo_minutes = pynutil.insert('fractional_part: "') + fractions_wo_minutes + pynutil.insert('"') + + graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_decimal |= cardinal.graph_two_digit + graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal + self.graph = graph_decimal + + point = pynutil.delete("נקודה") + + graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, '"true"') + delete_extra_space + optional_graph_negative = pynini.closure( + graph_negative, + 0, + 1, + ) + + graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') + + # integer could be an hour, but minutes cannot: convert to decimal + 
viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes + + # integer cannot be an hour, but minutes can: convert to decimal + unviable_hour_viable_minutes = ( + pynutil.insert('integer_part: "') + + cardinal.graph_wo_viable_hours + + pynutil.insert('"') + + delete_extra_space + + fractions_graph + ) + + # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time + negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph + + # all decimals with fractions, not excluding anything (used in other FSTs) + all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph + + # only cases with fractional part that cannot be interpreted as time + graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time + + # all decimals with the word "point" + graph_w_point = ( + pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional + ) + + final_graph_wo_sign = graph_w_point | graph_wo_point + self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point + final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign + + quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred) + final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py new file mode 100644 index 000000000..0232c4ff6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_SPACE, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure in Hebrew + e.g. מש עשרה אחוז -> measure { cardinal { integer: "15" } units: "%" } + e.g. מינוס חמש עשרה אחוז -> measure { cardinal { negative: "-" integer: "15" } units: "%" } + e.g. שלוש מיליגרם -> measure { cardinal { integer: "3" } units: "מ״ג" } + e.g. אלף אחוז -> measure { cardinal { integer: "1000" } units: "%" } + e.g. אחוז אחד -> measure { units: "%" cardinal { integer: "1" } } + e.g. 
סנטימטר אחד -> measure { units: "ס״מ" cardinal { integer: "1" } } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + """ + + def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): + super().__init__(name="measure", kind="classify") + + # optional negative sign + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE, + 0, + 1, + ) + + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + optional_prefix_graph = pynini.closure( + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + 0, + 1, + ) + + # cardinal numbers + cardinal_graph = cardinal.graph_no_exception + + # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 14 foot tall building) + subgraph_decimal = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal.final_graph_wo_sign + + pynutil.insert(" }") + + delete_extra_space + ) + + subgraph_cardinal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert('integer: "') + + cardinal_graph + + pynutil.insert('"') + + pynutil.insert(" }") + + delete_extra_space + ) + + # convert units + joined_units = pynini.string_file(get_abs_path("data/measurements.tsv")) + joined_units = pynini.invert(joined_units) + joined_units = pynutil.insert('units: "') + joined_units + pynutil.insert('"') + + spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) + spaced_units = pynini.invert(spaced_units) + spaced_units = pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 + + # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space + units_graph = joined_units | spaced_units + + # one graph is needed since it changed the order of the words. 
+ # We say "ten percent" for 10% but "percent one" for 1% + one = pynini.string_map([("אחד", "1")]) + one_graph = ( + insert_space + + pynutil.insert("cardinal { ") + + pynutil.insert('integer: "') + + one + + pynutil.insert('"') + + pynutil.insert(" }") + ) + + number_graph = subgraph_decimal | subgraph_cardinal + number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph) + + final_graph = optional_prefix_graph + number_unit_graph + delete_zero_or_one_space + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py new file mode 100644 index 000000000..c7306ea43 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying ordinal in Hebrew + e.g. 
ראשון -> ordinal { integer: "1" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="ordinal", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) + graph = NEMO_SIGMA + graph_digit + + self.graph = graph @ cardinal_graph + + final_graph = pynutil.insert('integer: "') + self.graph + pynutil.insert('"') + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py new file mode 100644 index 000000000..b963e7b74 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~" + punct = pynini.union(*s) + + graph = pynutil.insert('name: "') + punct + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py new file mode 100644 index 000000000..ac4965cfc --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -0,0 +1,202 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time in Hebrew. + Conversion is made only when am / pm time is not ambiguous! + e.g. שלוש דקות לחצות -> time { minutes: "57" hours: "23" } + e.g. 
באחת ושתי דקות בצהריים -> time { morphosyntactic_features: "ב" hours: "1" minutes: "02" suffix: "צהריים" } + e.g. שתיים ועשרה בבוקר -> time { hours: "2" minutes: "10" suffix: "בוקר" } + e.g. שתיים ועשרה בצהריים -> time { hours: "2" minutes: "10" suffix: "צהריים" } + e.g. שתיים עשרה ושלוש דקות אחרי הצהריים -> time { hours: "12" minutes: "03" suffix: "צהריים" } + e.g. רבע לשש בערב -> time { minutes: "45" hours: "5" suffix: "ערב" } + + """ + + def __init__(self): + super().__init__(name="time", kind="classify") + + # hours, minutes, seconds, suffix, zone, style, speak_period + midnight_to_hour_graph = pynini.string_file(get_abs_path("data/time/midnight_to_hour.tsv")) + to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv")) + + minute_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_verbose.tsv")) + minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv")) + minute_to_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_to_verbose.tsv")) + + suffix_graph = pynini.union( + pynini.string_file(get_abs_path("data/time/day_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/night_suffix.tsv")), + ) + + time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv")) + time_prefix_graph = ( + pynutil.insert('morphosyntactic_features: "') + time_prefix + pynutil.insert('"') + insert_space + ) + optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1) + + # only used for < 1000 thousand -> 0 weight + cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7) + + labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)] + labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)] + labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)] + + graph_hour = 
pynini.union(*labels_hour) @ cardinal + graph_hour |= midnight_to_hour_graph + add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT + graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit + graph_minute_double = pynini.union(*labels_minute_double) @ cardinal + + final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') + + graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double) + + final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"') + final_suffix = delete_space + insert_space + final_suffix + + time_word = "דקות" + optional_delete_time = pynini.closure(delete_space + pynutil.delete(time_word), 0, 1) + graph_h_and_m = ( + final_graph_hour + + delete_space + + delete_and + + insert_space + + pynutil.insert('minutes: "') + + pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph) + + pynutil.insert('"') + + optional_delete_time + ) + + graph_special_m_to_h_suffix_time = ( + pynutil.insert('minutes: "') + + minute_to_verbose_graph + + pynutil.insert('"') + + delete_space + + pynutil.delete("ל") + + insert_space + + pynutil.insert('hours: "') + + to_hour_graph + + pynutil.insert('"') + ) + + graph_m_to_h_suffix_time = ( + pynutil.insert('minutes: "') + + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph + + pynutil.insert('"') + + optional_delete_time + + delete_space + + pynutil.delete("ל") + + insert_space + + pynutil.insert('hours: "') + + to_hour_graph + + pynutil.insert('"') + ) + + graph_h = ( + optional_time_prefix_graph + + delete_zero_or_one_space + + final_graph_hour + + delete_extra_space + + pynutil.insert('minutes: "') + + (pynutil.insert("00") | graph_minute) + + pynutil.insert('"') + + final_suffix + ) + + midnight_graph = ( + optional_time_prefix_graph + + delete_zero_or_one_space + + pynutil.insert('hours: "') + + midnight_to_hour_graph + + pynutil.insert('"') 
+ + insert_space + + pynutil.insert('minutes: "') + + (pynutil.insert("00") | graph_minute) + + pynutil.insert('"') + ) + + graph_midnight_and_m = ( + pynutil.insert('hours: "') + + midnight_to_hour_graph + + pynutil.insert('"') + + delete_space + + delete_and + + insert_space + + pynutil.insert('minutes: "') + + pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph) + + pynutil.insert('"') + + optional_delete_time + ) + + to_midnight_verbose_graph = ( + pynutil.insert('minutes: "') + + minute_to_verbose_graph + + pynutil.insert('"') + + delete_space + + pynutil.delete("ל") + + insert_space + + pynutil.insert('hours: "') + + to_hour_graph + + pynutil.insert('"') + ) + + graph_m_to_midnight = ( + pynutil.insert('minutes: "') + + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph + + pynutil.insert('"') + + optional_delete_time + + delete_space + + pynutil.delete("ל") + + insert_space + + pynutil.insert('hours: "') + + to_hour_graph + + pynutil.insert('"') + ) + + final_graph_midnight = ( + optional_time_prefix_graph + + delete_zero_or_one_space + + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m) + ) + + final_graph = ( + optional_time_prefix_graph + + delete_zero_or_one_space + + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time) + + final_suffix + ) + final_graph |= graph_h + final_graph |= final_graph_midnight + + final_graph = self.add_tokens(final_graph.optimize()) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..807dcf734 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. 
+ + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = None, + ): + + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"he_itn.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst + date_graph = DateFst(ordinal=ordinal, cardinal=cardinal).fst + word_graph = WordFst().fst + time_graph = TimeFst().fst + whitelist_graph = WhiteListFst(input_file=whitelist).fst + punct_graph = PunctuationFst().fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(measure_graph, 1.1) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + 
token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py new file mode 100644 index 000000000..58de7668e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, string_map_cased +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import convert_space, insert_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. misses -> tokens { name: "mrs." } + This class has highest priority among all classifier grammars. + Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified). + + Args: + input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n), + e.g. 
nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv + """ + + def __init__(self, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + + if input_file is None: + input_file = get_abs_path("data/whitelist.tsv") + + if not os.path.exists(input_file): + raise ValueError(f"Whitelist file {input_file} not found") + + optional_prefix_graph = pynini.closure( + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + 0, + 1, + ) + whitelist = string_map_cased(input_file) + graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') + final_graph = optional_prefix_graph + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py new file mode 100644 index 000000000..6b5394ac3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py new file mode 100644 index 000000000..1aa996b80 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -0,0 +1,182 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +#################### +# HEBREW CONSTANTS # +#################### +units_feminine_dict = { + "0": "אפס", + "1": "אחת", + "2": "שתיים", + "3": "שלוש", + "4": "ארבע", + "5": "חמש", + "6": "שש", + "7": "שבע", + "8": "שמונה", + "9": "תשע", +} + +units_masculine_dict = { + "0": "אפס", + "1": "אחד", + "2": "שניים", + "3": "שלושה", + "4": "ארבעה", + "5": "חמישה", + "6": "שישה", + "7": "שבעה", + "8": "שמונה", + "9": "תשעה", +} + +tens_dict = { + "2": "עשרים", + "3": "שלושים", + "4": "ארבעים", + "5": "חמישים", + "6": "שישים", + "7": "שבעים", + "8": "שמונים", + "9": "תשעים", +} + +ten = { + "short": "עשר", + "long": "עשרה", +} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' + + +############# +# FUNCTIONS # +############# +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." 
and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res + + +def digit_by_digit(num): + + dbd = [" ".join([units_feminine_dict[digit] for digit in num])] + + # generate "1" as masculine and as feminine if exists + if units_feminine_dict["1"] in dbd[0]: + dbd.append(dbd[0].replace(units_feminine_dict["1"], units_masculine_dict["1"])) + + return dbd + + +def integer_to_text(num, only_fem=False): + if isinstance(num, int): + num = str(num) + # number is zero + if num == len(num) * "0": + return ["אפס"] + else: + # remove leading zeros from number + num = num.lstrip("0") + + # units + if len(num) == 1: + return _less_than_10(num, only_fem) + + # tenths + elif len(num) == 2: + return _less_than_100(num, only_fem) + + else: + raise Exception + + +def _less_than_10(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-9 + """ + + if only_fem: + return [units_feminine_dict[num]] + else: + return [units_feminine_dict[num], units_masculine_dict[num]] + + +def _less_than_100(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-99 + """ + + # init result + res = list() + + # split number to digits + tens, units = num + + # number is in range 0-9 + if len(num) == 1: + res.extend(_less_than_10(num)) + + # number is in range 10-99 + elif len(num) == 2: + + if num == "10": + if only_fem: + res.extend([ten["short"]]) + else: + res.extend([ten["long"], ten["short"]]) + + # number is in range 11-19 + elif tens == "1": + res.append(f'{units_feminine_dict[num[1]]} {ten["long"]}') + if not only_fem: + res.append(f'{units_masculine_dict[num[1]]} {ten["short"]}') + + else: + + # number is in range 20-99, a multiplication of 10 + if units == "0": + res.append(tens_dict[num[0]]) + + # number is in range 20-99, but not multiplication of 10 + else: + res.append(f'{tens_dict[num[0]]} {"ו"}{units_feminine_dict[num[1]]}') + if not only_fem: + res.append(f'{tens_dict[num[0]]} 
{"ו"}{units_masculine_dict[num[1]]}') + + return res diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py new file mode 100644 index 000000000..d26e1f703 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal in Hebrew + e.g. cardinal { prefix: "וב" integer: "3405"} -> וב-3,405 + e.g. cardinal { negative: "-" integer: "904" } -> -904 + e.g. cardinal { prefix: "כ" integer: "123" } -> כ-123 + + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT**3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete('"') + + pynini.accep("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # removes integer aspect + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit + + pynutil.delete('"') + ) + + # Add thousands separator + graph = graph @ group_by_threes + + self.numbers = graph + + # add prefix and sign + graph = optional_prefix + optional_sign + graph + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py new file mode 100644 index 000000000..4a1b24599 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -0,0 +1,120 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SPACE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, + e.g. 
{ day_prefix: "ה" day: "1" month_prefix: "ב" month: "6" year: "2012" } -> ה-1.6.2012 + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + day_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + ) + + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1, 2) + + pynutil.insert(".") + + pynutil.delete('"') + + delete_space + ) + + month_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + + delete_space + ) + + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + year_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 3) + + pynutil.delete('"') + + delete_space + ) + + year = ( + pynutil.delete("year:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + ####################### + # DATE FORMATS GRAPHS # + ####################### + + # day and month only + graph_dm = ( + pynini.closure(day_prefix + delete_zero_or_one_space, 0, 1) + + day + + pynini.closure(delete_zero_or_one_space, 0, 1) + + month + + delete_zero_or_one_space + ) + + # day month and year + graph_dmy = graph_dm + delete_space + pynutil.insert(".") + delete_zero_or_one_space + year + + # only month and year + graph_my = ( + pynini.closure(month_prefix + delete_zero_or_one_space, 0, 1) + + month + + pynutil.insert(NEMO_SPACE) + + pynini.closure(delete_zero_or_one_space + year, 0, 1) + ) + + # only year + graph_y_only = year_prefix + insert_space + year + + final_graph = (graph_dm | graph_dmy | graph_my | graph_y_only) + delete_space + + 
delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py new file mode 100644 index 000000000..ea69ab784 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal, + e.g. decimal { integer_part: "0" fractional_part: "33" } -> 0.33 + e.g. decimal { negative: "true" integer_part: "400" fractional_part: "323" } -> -400.323 + e.g. 
decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" } -> 4.5 מיליון + + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + optionl_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1) + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT**3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + integer = integer @ group_by_threes + + optional_integer = pynini.closure(integer + delete_space, 0, 1) + + fractional = ( + pynutil.insert(".") + + pynutil.delete("fractional_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + optional_fractional = pynini.closure(fractional + delete_space, 0, 1) + + quantity = ( + pynutil.delete("quantity:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + graph = optional_prefix + optional_integer + optional_fractional + optional_quantity + self.numbers = graph + graph = optionl_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py new file mode 100644 index 
000000000..a4aadd67b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_CHAR, + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure, in Hebrew. + Some measures are concatenated to the numbers and other are don't (two measure lists) + e.g. measure { cardinal { integer: "3" } units: "מ״ג" } -> 3 מ״ג + e.g. measure { cardinal { integer: "1000" } units: "%" } -> 1,000% + e.g. measure { units: "%" cardinal { integer: "1" } } -> 1% + e.g. measure { units: "ס״מ" cardinal { integer: "1" } } -> 1 ס״מ + e.g. 
measure { prefix: "ל" cardinal { integer: "4" } units: "ס״מ" } -> ל-4 ס״מ + + Args: + decimal: DecimalFst + cardinal: CardinalFst + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst): + super().__init__(name="measure", kind="verbalize") + + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete('"') + + pynini.accep("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + optional_sign + + decimal.numbers + + delete_space + + pynutil.delete("}") + ) + + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) + + unit = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + + pynutil.delete('"') + + delete_space + ) + unit @= pynini.cdrewrite( + pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA # noqa: W605 + ) # For space separated measures. 
+ + numbers_units = delete_space + unit + numbers_graph = (graph_cardinal | graph_decimal) + numbers_units + + one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete('cardinal { integer: "1" }') + + graph = optional_prefix + (numbers_graph | one_graph) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py new file mode 100644 index 000000000..a85f5b019 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinal in Hebrew + e.g. 
ordinal { integer: "10" } -> 10 + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py new file mode 100644 index 000000000..3d41b783b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time in Hebrew + e.g. time { hours: "2" minutes: "55" suffix: "בלילה" } -> 2:55 בלילה + e.g. time { hours: "2" minutes: "57" suffix: "בבוקר" } -> 2:57 בבוקר + e.g. time { morphosyntactic_features: "ב" hours: "6" minutes: "32" suffix: "בערב" } -> ב-18:32 בערב + e.g. 
time { morphosyntactic_features: "בשעה" hours: "2" minutes: "10" suffix: "בצהריים" } -> בשעה-14:10 בצהריים + + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + + hour_to_noon = pynini.string_file(get_abs_path("data/time/hour_to_noon.tsv")) + hour_to_evening = pynini.string_file(get_abs_path("data/time/hour_to_evening.tsv")) + hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv")) + + day_suffixes = pynini.string_file(get_abs_path("data/time/day_suffix.tsv")) + day_suffixes = insert_space + pynutil.delete('suffix: "') + day_suffixes + pynutil.delete('"') + + noon_suffixes = pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")) + noon_suffixes = insert_space + pynutil.delete('suffix: "') + noon_suffixes + pynutil.delete('"') + + evening_suffixes = pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")) + evening_suffixes = insert_space + pynutil.delete('suffix: "') + evening_suffixes + pynutil.delete('"') + + night_suffixes = pynini.string_file(get_abs_path("data/time/night_suffix.tsv")) + night_suffixes = insert_space + pynutil.delete('suffix: "') + night_suffixes + pynutil.delete('"') + + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + + minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + + prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + ) + + optional_prefix = pynini.closure(prefix + delete_zero_or_one_space, 0, 1) + optional_suffix = pynini.closure(delete_space + day_suffixes, 0, 1) + graph = hour + delete_space + pynutil.insert(":") + minute + optional_suffix + + for hour_to, suffix in zip( + [hour_to_noon, hour_to_evening, hour_to_night], + [noon_suffixes, 
evening_suffixes, night_suffixes], + ): + graph |= hour @ hour_to + delete_space + pynutil.insert(":") + minute + delete_space + suffix + + graph |= optional_prefix + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py new file mode 100644 index 000000000..0223259db --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import WhiteListFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars in Hebrew. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal_graph = OrdinalFst().fst + + decimal = DecimalFst() + decimal_graph = decimal.fst + + measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + + time_graph = TimeFst().fst + + date_graph = DateFst().fst + + whitelist_graph = WhiteListFst().fst + + graph = ( + time_graph | date_graph | measure_graph | ordinal_graph | decimal_graph | cardinal_graph | whitelist_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py new file mode 100644 index 000000000..611181df4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence in Hebrew + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py new file mode 100644 index 000000000..0607e0b37 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "mrs." } -> mrs. + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + graph = graph @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) + final_graph = optional_prefix + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py new file mode 100644 index 000000000..49c61cf6a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') + graph = char @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..da85318b1 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'he': # Hebrew + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize_final import ( + 
VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'he', 'hi', 'hy', 'mr', 'ja'], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv new file mode 100644 index 000000000..6fcfb8b3a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv @@ -0,0 +1,5 @@ +हफ़्ते +सप्ताह +सदियां +सदियों + diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv similarity index 77% rename from nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv rename to nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv index eaddf930a..dc20bcb21 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv @@ -4,9 +4,8 @@ h घंटे min मिनट doz दर्जन yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना months महीने -हफ़्ते हफ़्ते \ No newline at end of file + diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 189512687..4065bc86b 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -134,7 +134,6 @@ KHz किलोहर्ट्ज़ N न्यूटन dB डेसीबल yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना 
diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv new file mode 100644 index 000000000..bfe5738d0 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv @@ -0,0 +1,12 @@ +१ला पहला +१ली पहली +२रा दूसरा +२री दूसरी +३रा तीसरा +३री तीसरी +४था चौथा +४थी चौथी +५वां पाँचवां +५वीं पाँचवीं +६ठा छठा +६ठी छठी diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv new file mode 100644 index 000000000..922e9d6b8 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -0,0 +1,3 @@ +वां +वीं +वें diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv new file mode 100644 index 000000000..77139cff5 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv @@ -0,0 
+1,2 @@ +वे वें + diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv new file mode 100644 index 000000000..46b485af6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv @@ -0,0 +1,3 @@ +नंबर +कार्ड +क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv new file mode 100644 index 000000000..17a123bee --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv @@ -0,0 +1,5 @@ +नंबर +मोबाइल +फोन +लैंडलाइन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv new file mode 100644 index 000000000..f2fa6e52f --- /dev/null +++ 
b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv @@ -0,0 +1,4 @@ +नंबर +मोबाइल +फोन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv new file mode 100644 index 000000000..e8c04b723 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv @@ -0,0 +1,10 @@ +0 शून्य +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv new file mode 100644 index 000000000..322c7248e --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv @@ -0,0 +1,4 @@ +नंबर +पिन +कोड +पिनकोड \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv new file mode 100644 index 000000000..3477871e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv @@ -0,0 +1,100 @@ +० एक +१ दो +२ तीन +३ चार +४ पाँच +५ छह +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१७ अठारह +१८ उन्नीस +१९ बीस +२० इक्कीस +२१ बाईस +२२ तेईस +२३ चौबीस +२४ पच्चीस +२५ छब्बीस +२६ सत्ताईस +२७ अट्ठाईस +२८ उनतीस +२९ तीस +३० इकतीस +३१ बत्तीस +३२ तैंतीस +३३ चौंतीस +३४ पैंतीस +३५ छत्तीस +३६ सैंतीस +३७ अड़तीस +३८ उनतालीस +३९ चालीस +४० इकतालीस +४१ बयालीस +४२ तैंतालीस +४३ चौवालीस +४४ पैंतालीस +४५ छियालीस +४६ सैंतालीस +४७ अड़तालीस +४८ उनचास +४९ पचास +५० इक्यावन +५१ बावन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५६ सत्तावन +५७ अट्ठावन +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६९ सत्तर +७० इकहत्तर +७१ बहत्तर +७२ तिहत्तर 
+७३ चौहत्तर +७४ पचहत्तर +७५ छिहत्तर +७६ सतहत्तर +७७ अठहत्तर +७८ उनासी +७९ अस्सी +८० इक्यासी +८१ बयासी +८२ तिरासी +८३ चौरासी +८४ पचासी +८५ छियासी +८६ सत्तासी +८७ अट्ठासी +८८ नवासी +८९ नब्बे +९० इक्यानबे +९१ बानबे +९२ तिरानबे +९३ चौरानबे +९४ पंचानबे +९५ छियानबे +९६ सत्तानबे +९७ अट्ठानबे +९८ निन्यानबे +९९ एक सौ diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index 6a5d3c699..5bbc736fd 100644 --- a/nemo_text_processing/text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -30,6 +30,13 @@ NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_ZERO = "०" + +HI_DEDH = "डेढ़" # 1.5 +HI_DHAI = "ढाई" # 2.5 +HI_SAVVA = "सवा" # quarter more (1.25) +HI_SADHE = "साढ़े" # half more (X.5) +HI_PAUNE = "पौने" # quarter less (0.75) + NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index c50384acf..f361416f4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,18 +15,18 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space from nemo_text_processing.text_normalization.hi.utils import get_abs_path class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. 
- -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. + -२३ -> cardinal { negative: "true" integer: "तेइस" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -37,6 +37,10 @@ def __init__(self, deterministic: bool = True, lm: bool = False): teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + self.digit = digit + self.zero = zero + self.teens_and_ties = teens_and_ties + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: @@ -294,6 +298,12 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) graph_ten_shankhs.optimize() + # Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc. 
+ # e.g., "०५" -> "शून्य पाँच" + single_digit = digit | zero + graph_leading_zero = zero + insert_space + single_digit + graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5) + final_graph = ( digit | zero @@ -315,6 +325,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): | graph_ten_padmas | graph_shankhs | graph_ten_shankhs + | graph_leading_zero ) optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 37b192165..b25abcac6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -65,11 +65,11 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) - cardinal_graph = ( - digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + cardinal_graph = pynini.union( + digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands ) - graph_year = graph_year_thousands | graph_year_hundreds_as_thousands + graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") @@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst): # Updated logic to use prefix_union year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") - graph_dd_mm_yyyy = ( - days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph - ) + delete_separator = pynini.union(delete_dash, delete_slash) + graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph - graph_mm_dd_yyyy = ( - months_graph + 
(delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph - ) + graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index 955e8c0d3..cb21d85b1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -58,9 +58,7 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - + graph_digit = cardinal.digit | cardinal.zero cardinal_graph = cardinal.final_graph self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index 8971cd3dd..b5528deba 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -15,7 +15,20 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + HI_SAVVA, + NEMO_SPACE, + GraphFst, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +HI_ONE_HALF = "१/२" # 1/2 +HI_ONE_QUARTER = "१/४" # 1/4 +HI_THREE_QUARTERS = "३/४" # 3/4 class FractionFst(GraphFst): @@ -39,21 +52,76 @@ def __init__(self, cardinal, deterministic: bool = True): cardinal_graph = cardinal.final_graph 
self.optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1 ) self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") self.numerator = ( - pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ") + pynutil.insert("numerator: \"") + + cardinal_graph + + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"") + + pynutil.insert(NEMO_SPACE) ) self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - self.graph = ( + dedh_dhai_graph = pynini.string_map( + [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)] + ) + + savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "") + sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_paune = ( + pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + 
final_graph = ( self.optional_graph_negative - + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1) + self.numerator + self.denominator ) + weighted_graph = ( + final_graph + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.2) + ) + + self.graph = weighted_graph + graph = self.graph - final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() + graph = self.add_tokens(graph) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 9f1ffbd39..b7d74731e 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -15,9 +15,24 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + HI_SAVVA, + NEMO_SPACE, + GraphFst, + delete_space, + insert_space, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path +HI_POINT_FIVE = ".५" # .5 +HI_ONE_POINT_FIVE = "१.५" # 1.5 +HI_TWO_POINT_FIVE = "२.५" # 2.5 +HI_DECIMAL_25 = ".२५" # .25 +HI_DECIMAL_75 = ".७५" # .75 digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) @@ -41,8 +56,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") cardinal_graph = ( - digit - | teens_and_ties + cardinal.zero + | cardinal.digit + | cardinal.teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands @@ -53,7 +69,11 @@ def 
__init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) - quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) + + # Load quarterly units from separate files: map (FST) and list (FSA) + quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv")) + quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv")) + quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, @@ -64,16 +84,28 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Define the quarterly measurements quarter = pynini.string_map( [ - (".५", "साढ़े"), - ("१.५", "डेढ़"), - ("२.५", "ढाई"), + (HI_POINT_FIVE, HI_SADHE), + (HI_ONE_POINT_FIVE, HI_DEDH), + (HI_TWO_POINT_FIVE, HI_DHAI), ] ) quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling - unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") - units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") + unit = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + unit_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + units = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + quarterly_units_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) # Handling symbols like x, X, * symbol_graph = pynini.string_map( @@ -93,10 +125,71 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - graph_quarter = ( + dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), 
(HI_TWO_POINT_FIVE, HI_DHAI)]) + dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"") + + savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "") + savva_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SAVVA) + + pynutil.insert(NEMO_SPACE) + + savva_numbers + + pynutil.insert("\"") + ) + + sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "") + sadhe_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SADHE) + + pynutil.insert(NEMO_SPACE) + + sadhe_numbers + + pynutil.insert("\"") + ) + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "") + paune_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_PAUNE) + + pynutil.insert(NEMO_SPACE) + + paune_numbers + + pynutil.insert("\"") + ) + + graph_dedh_dhai = ( pynutil.insert("cardinal { ") + optional_graph_negative - + quarter_graph + + dedh_dhai_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + units + ) + + graph_savva = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + savva_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + units + ) + + graph_sadhe = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + sadhe_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + units + ) + + graph_paune = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + paune_graph + pynutil.insert(" }") + delete_space + units @@ -108,7 +201,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("integer: \"") + cardinal_graph + pynutil.insert("\"") - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + unit ) @@ -121,9 +215,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + cardinal_graph + pynutil.insert("\"") + pynutil.insert(" }") - + pynutil.insert(" 
units: \"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + symbol_graph - + pynutil.insert("\" ") + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + pynutil.insert("} }") + insert_space + pynutil.insert("tokens { cardinal { ") @@ -134,10 +230,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph = ( - pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_quarter, 0.005) - | pynutil.add_weight(graph_cardinal, 0.01) - | pynutil.add_weight(graph_exceptions, 0.01) + pynutil.add_weight(graph_decimal, 0.1) + | pynutil.add_weight(graph_cardinal, 0.1) + | pynutil.add_weight(graph_exceptions, 0.1) + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.5) ) self.graph = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py new file mode 100644 index 000000000..5f1cefed4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying Hindi ordinals, e.g. + १०वां -> ordinal { integer: "दसवां" } + २१वीं -> ordinal { integer: "इक्कीसवीं" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: CardinalFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + + suffixes_list = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv")) + suffixes_map = pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv")) + suffixes_fst = pynini.union(suffixes_list, suffixes_map) + exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv")) + + graph = cardinal.final_graph + suffixes_fst + exceptions = pynutil.add_weight(exceptions, -0.1) + graph = pynini.union(exceptions, graph) + + final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py index 8309ba030..14c9a1a55 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py @@ -36,9 +36,9 @@ def __init__(self, deterministic: bool = True): emphasis = ( pynini.accep("<") - + ( - (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)) - | (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - 
pynini.union("<", ">"), 1)) + + pynini.union( + (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)), + (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)), ) + pynini.accep(">") ) diff --git a/nemo_text_processing/text_normalization/hi/taggers/telephone.py b/nemo_text_processing/text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..d20870c0d --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/telephone.py @@ -0,0 +1,228 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import (
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_HI_DIGIT,
    NEMO_SPACE,
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.hi.utils import get_abs_path

# Zero may be written in ASCII or Devanagari form.
HI_ZERO_DIGIT = pynini.union("0", "०")
# Indian mobile numbers begin with 6-9; landline STD numbers with 2-4 or 6
# (each digit accepted in both ASCII and Devanagari).
HI_MOBILE_START_DIGITS = pynini.union("६", "७", "८", "९", "6", "7", "8", "9").optimize()
HI_LANDLINE_START_DIGITS = pynini.union("२", "३", "४", "६", "2", "3", "4", "6").optimize()

delete_zero = pynutil.delete(HI_ZERO_DIGIT)
delete_zero_optional = pynini.closure(delete_zero, 0, 1)
# Spoken replacement for a deleted leading zero ("shunya" = zero).
insert_shunya = pynutil.insert('शून्य') + insert_space

# Load the number mappings from the TSV files.
digit_to_word = pynini.string_file(get_abs_path("data/telephone/number.tsv"))
digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
mobile_context = pynini.string_file(get_abs_path("data/telephone/mobile_context.tsv"))
landline_context = pynini.string_file(get_abs_path("data/telephone/landline_context.tsv"))
credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv"))
pincode_context = pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv"))

# Reusable optimized graph for any single digit token (word, digit or zero).
num_token = pynini.union(digit_to_word, digits, zero).optimize()


def generate_mobile(context_keywords: pynini.Fst) -> pynini.Fst:
    """Build the tagger graph for mobile numbers, with optional '+' country
    code prefix and optional extension, gated by surrounding context keywords.

    Args:
        context_keywords: FST accepting keywords that signal a mobile number.
    Returns:
        Optimized FST emitting country_code/number_part/extension fields.
    """
    context_before, context_after = get_context(context_keywords)

    # Filter cardinals to only include allowed mobile start digits.
    mobile_start_digit = pynini.union(HI_MOBILE_START_DIGITS @ digits, HI_MOBILE_START_DIGITS @ digit_to_word)

    country_code_digits = pynini.closure(num_token + insert_space, 1, 3)
    country_code = (
        pynutil.insert("country_code: \"")
        + context_before
        + pynini.cross("+", "प्लस")  # verbalize '+' as "plus"
        + insert_space
        + country_code_digits
        + pynutil.insert("\" ")
        + pynini.closure(delete_space, 0, 1)
    )

    extension_optional = pynini.closure(
        pynutil.insert("extension: \"")
        + pynini.closure(num_token + insert_space, 1, 3)
        + context_after
        + pynutil.insert("\" ")
        + delete_space,
        0,
        1,
    )

    # 10-digit mobile: constrained first digit plus at least nine more tokens.
    number_part = mobile_start_digit + insert_space + pynini.closure(num_token + insert_space, 9)

    number_without_country = (
        pynutil.insert("number_part: \"")
        + context_before
        + delete_zero_optional
        + insert_shunya
        + number_part
        + context_after
        + pynutil.insert("\" ")
        + delete_space
    )

    number_with_country = (
        country_code
        + pynutil.insert("number_part: \"")
        + number_part
        + context_after
        + pynutil.insert("\" ")
        + delete_space
    )

    return (pynini.union(number_with_country, number_without_country) + extension_optional).optimize()


def get_landline(std_length: int, context_keywords: pynini.Fst) -> pynini.Fst:
    """Build the tagger graph for a landline number with an STD code of
    ``std_length`` digits; the subscriber part fills the remaining digits.

    Args:
        std_length: number of digits in the STD (area) code.
        context_keywords: FST accepting keywords that signal a landline.
    Returns:
        Optimized FST emitting a number_part field.
    """
    context_before, context_after = get_context(context_keywords)

    # Filter cardinals to only include allowed landline start digits.
    landline_start_digit = pynini.union(HI_LANDLINE_START_DIGITS @ digits, HI_LANDLINE_START_DIGITS @ digit_to_word)

    std_code_graph = (
        delete_zero_optional + insert_shunya + pynini.closure(num_token + insert_space, std_length, std_length)
    )

    # Total significant digits is 10; one is consumed by landline_start_digit.
    landline_digit_count = 9 - std_length
    landline_graph = (
        landline_start_digit
        + insert_space
        + pynini.closure(num_token + insert_space, landline_digit_count, landline_digit_count)
    )

    separator_optional = pynini.closure(pynini.union(pynini.cross("-", ""), pynini.cross(".", "")), 0, 1)

    std_code_in_brackets = (
        delete_zero_optional
        + delete_space
        + pynutil.delete("(")
        + pynini.closure(delete_space, 0, 1)
        + std_code_graph
        + pynini.closure(delete_space, 0, 1)
        + pynutil.delete(")")
    )

    std_part = pynini.union(std_code_graph, std_code_in_brackets)

    return (
        pynutil.insert("number_part: \"")
        + context_before
        + std_part
        + separator_optional
        + delete_space
        + landline_graph
        + context_after
        + pynutil.insert("\" ")
    ).optimize()


def generate_landline(context_keywords: pynini.Fst) -> pynini.Fst:
    """Union of landline graphs for every supported STD code length (2-7)."""
    graph = pynini.union(*(get_landline(std_length, context_keywords) for std_length in range(2, 8)))
    return graph.optimize()


def get_context(keywords: pynini.Fst):
    """Build optional before/after context acceptors around a number.

    Accepts up to five non-digit words between the keyword and the number,
    in either direction.

    Returns:
        Tuple of (before, after) optimized FSTs.
    """
    all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT)

    non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE))
    word = pynini.closure(non_digit_char, 1) + pynini.accep(NEMO_SPACE)

    window = pynini.closure(word, 0, 5)

    before = pynini.closure(keywords + pynini.accep(NEMO_SPACE) + window, 0, 1)

    after = pynini.closure(pynutil.delete(NEMO_SPACE) + window + keywords, 0, 1)

    return before.optimize(), after.optimize()


def generate_credit(context_keywords: pynini.Fst) -> pynini.Fst:
    """Tagger graph for credit-card style numbers (four or more digit tokens)."""
    context_before, context_after = get_context(context_keywords)
    return (
        pynutil.insert("number_part: \"")
        + context_before
        + pynini.closure(num_token + insert_space, 4)
        + context_after
        + pynutil.insert("\" ")
        + delete_space
    ).optimize()


def generate_pincode(context_keywords: pynini.Fst) -> pynini.Fst:
    """Tagger graph for Indian PIN codes (six or more digit tokens)."""
    context_before, context_after = get_context(context_keywords)
    return (
        pynutil.insert("number_part: \"")
        + context_before
        + pynini.closure(num_token + insert_space, 6)
        + context_after
        + pynutil.insert("\" ")
        + delete_space
    ).optimize()


class TelephoneFst(GraphFst):
    """
    Finite state transducer for tagging telephone numbers, e.g.
        ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" }
        +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" }
        १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" }

    Combines mobile, landline, credit-card and PIN-code graphs, weighted so
    the more specific patterns (mobile, landline) win over the generic ones.
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        mobile_number = generate_mobile(mobile_context)
        landline = generate_landline(landline_context)
        credit_card = generate_credit(credit_context)
        pincode = generate_pincode(pincode_context)

        # Lower weight = preferred path.
        graph = (
            pynutil.add_weight(mobile_number, 0.7)
            | pynutil.add_weight(landline, 0.8)
            | pynutil.add_weight(credit_card, 0.9)
            | pynutil.add_weight(pincode, 1)
        )

        self.final = graph.optimize()
        self.fst = self.add_tokens(self.final)
seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv")) @@ -36,10 +51,11 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") delete_colon = pynutil.delete(":") + cardinal_graph = cardinal.digit | cardinal.teens_and_ties self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ") self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ") @@ -54,9 +70,57 @@ def __init__(self): graph_hm = self.hours + delete_colon + insert_space + self.minutes # hour - graph_h = self.hours + delete_colon + pynutil.delete("००") + graph_h = self.hours + delete_colon + pynutil.delete(HI_DOUBLE_ZERO) + + dedh_dhai_graph = pynini.string_map([("१" + HI_TIME_THIRTY, HI_DEDH), ("२" + HI_TIME_THIRTY, HI_DHAI)]) + + savva_numbers = cardinal_graph + pynini.cross(HI_TIME_FIFTEEN, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(HI_TIME_THIRTY, "") + sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(HI_TIME_FORTYFIVE, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - final_graph = graph_hms | graph_hm | graph_h + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_paune = ( + 
pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + final_graph = ( + graph_hms + | pynutil.add_weight(graph_hm, 0.3) + | pynutil.add_weight(graph_h, 0.3) + | pynutil.add_weight(graph_dedh_dhai, 0.1) + | pynutil.add_weight(graph_savva, 0.2) + | pynutil.add_weight(graph_sadhe, 0.2) + | pynutil.add_weight(graph_paune, 0.1) + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index b1bbd2a10..e3e6fc5d8 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -14,12 +14,12 @@ import logging import os -import time import pynini from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, delete_extra_space, @@ -32,7 +32,9 @@ from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.hi.taggers.word import WordFst @@ -77,51 +79,39 @@ def __init__( else: logging.info(f"Creating ClassifyFst grammars.") - start_time = time.time() cardinal = 
CardinalFst(deterministic=deterministic) cardinal_graph = cardinal.fst - logging.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes") - start_time = time.time() decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) decimal_graph = decimal.fst - logging.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes") - start_time = time.time() fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) fraction_graph = fraction.fst - logging.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") - start_time = time.time() date = DateFst(cardinal=cardinal) date_graph = date.fst - logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") - start_time = time.time() - timefst = TimeFst() + timefst = TimeFst(cardinal=cardinal) time_graph = timefst.fst - logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") - start_time = time.time() measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") - start_time = time.time() money = MoneyFst(cardinal=cardinal) money_graph = money.fst - logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") - start_time = time.time() + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + ordinal_graph = ordinal.fst + whitelist_graph = WhiteListFst( input_case=input_case, deterministic=deterministic, input_file=whitelist ).fst - logging.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes") - start_time = time.time() punctuation = PunctuationFst(deterministic=deterministic) punct_graph = punctuation.fst - logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes") + + telephone = TelephoneFst() + 
telephone_graph = telephone.fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -132,35 +122,39 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) ) - start_time = time.time() word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst - logging.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes") punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct), + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct), + ), 1, ) - classify |= pynutil.add_weight(word_graph, 100) + classify = pynini.union(classify, pynutil.add_weight(word_graph, 100)) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(NEMO_SPACE)) + + token + + pynini.closure(pynutil.insert(NEMO_SPACE) + punct) ) graph = token_plus_punct + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct + pynutil.insert(" ")) + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)), ) + token_plus_punct ) graph = delete_space + graph + delete_space - graph |= punct + graph = pynini.union(graph, punct) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index bc354232b..00feb1827 100644 
--- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -40,10 +40,9 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): # Define Hindi characters and symbols using pynini.union HINDI_CHAR = pynini.union( - *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants - *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters - *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics - *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits + *[chr(i) for i in range(0x0900, 0x0903 + 1)], # Hindi vowels and consonants + *[chr(i) for i in range(0x0905, 0x0939 + 1)], # More Hindi characters + *[chr(i) for i in range(0x093E, 0x094D + 1)], # Hindi diacritics ).optimize() # Include punctuation in the graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index 7e3b33b7c..a07c41eae 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -40,6 +40,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") insert_aur = pynutil.insert(" और ") + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) fraction_default = numerator + insert_bata + denominator @@ -47,7 +50,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + fraction_default - ) + ) | graph_quarter graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py 
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reconstructed verbalizers for Hindi ordinals (verbalizers/ordinal.py) and
# telephone numbers (verbalizers/telephone.py).

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import (
    MIN_NEG_WEIGHT,
    NEMO_NOT_QUOTE,
    NEMO_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing Hindi ordinals, e.g.
        ordinal { integer: "दसवां" } -> दसवां
        ordinal { integer: "इक्कीसवीं" } -> इक्कीसवीं

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Strip the surrounding quotes and keep the spoken ordinal verbatim.
        quoted_value = (
            delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        self.fst = self.delete_tokens(pynutil.delete("integer:") + quoted_value).optimize()


class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone numbers, e.g.
        telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } -> प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह
        telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } -> शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

        # Optional "country_code" field: unwrap the quoted value and emit
        # it followed by a single space.
        country_code_opt = pynini.closure(
            pynutil.delete("country_code: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
            + delete_space
            + insert_space,
            0,
            1,
        )

        # Mandatory "number_part" field; a trailing space inside the quotes
        # is swallowed (negatively weighted so the deletion is preferred).
        digits_body = (
            pynutil.delete("number_part: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynini.closure(pynutil.add_weight(pynutil.delete(NEMO_SPACE), MIN_NEG_WEIGHT), 0, 1)
            + pynutil.delete("\"")
        )

        # Optional "extension" field, space-separated from the number.
        extension_opt = pynini.closure(
            delete_space
            + insert_space
            + pynutil.delete("extension: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\""),
            0,
            1,
        )

        self.fst = self.delete_tokens(country_code_opt + digits_body + extension_opt).optimize()
+63,17 @@ def __init__(self): + insert_second ) + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + # hour minute graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute # hour graph_h = hour + delete_space + insert_baje - self.graph = graph_hms | graph_hm | graph_h + self.graph = graph_hms | graph_hm | graph_h | graph_quarter final_graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index e91f0d9f6..12ae316b1 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -19,6 +19,8 @@ from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -49,7 +51,7 @@ def __init__(self, deterministic: bool = True): date = DateFst() date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal=cardinal) time_graph = time.fst measure = MeasureFst(cardinal=cardinal, decimal=decimal) @@ -58,6 +60,11 @@ def __init__(self, deterministic: bool = True): money = MoneyFst() money_graph = money.fst + telephone = TelephoneFst() + telephone_graph = telephone.fst + ordinal = OrdinalFst(deterministic=deterministic) + ordinal_graph = ordinal.fst + whitelist_graph = WhiteListFst(deterministic=deterministic).fst 
graph = ( @@ -68,7 +75,9 @@ def __init__(self, deterministic: bool = True): | time_graph | measure_graph | money_graph + | ordinal_graph | whitelist_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/he/__init__.py b/tests/nemo_text_processing/he/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/tests/nemo_text_processing/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..cfb6f8db0 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,138 @@ +אפס~אפס +מינוס שלוש~-3 +עשר~עשר +שלוש עשרה~13 +שלושה עשר~13 +עשרים~20 +עשרים ותשע~29 +עשרים ותשעה~29 +ארבעים~40 +מינוס ארבעים ושש~-46 +שבעים ושבעה~77 +מאה~100 +מאה ואחת~101 +מאה ועשר~110 +מאה ושש עשרה~116 +מאה עשרים~120 +מאה ועשרים~120 +כמאה עשרים וחמש~כ-125 +מאתיים~200 +מאתיים ושלוש~203 +מאתיים שלושים~230 +שלוש מאות ושלושים~330 +מינוס מאתיים שישים ושבע~-267 +ארבע מאות~400 +כחמש מאות עובדים~כ-500 עובדים +חמש מאות שבעים ותשע~579 +תשע מאות תשעים~990 +תשע מאות תשעים ותשע~999 +אלף~1,000 +אלף וארבע~1,004 +אלף עשרים ושמונה~1,028 +אלף מאה וחמש~1,105 +אלף מאה שלושים~1,130 +אלף תשע מאות תשעים ואחת~1,991 +אלפיים~2,000 +אלפיים וחמש~2,005 +אלפיים ועשר~2,010 +אלפיים ואחת עשרה~2,011 +אלפיים מאה~2,100 +אלפיים מאתיים~2,200 +מינוס אלפיים מאתיים עשרים ושתיים~-2,222 +אלפיים שלוש מאות~2,300 +אלפיים ארבע מאות ושבע~2,407 +מינוס אלפיים ארבע מאות שבעים~-2,470 +מינוס אלפיים ארבע מאות שבעים וחמש~-2,475 +שלושת אלפים~3,000 +שלושת אלפים וחמש~3,005 +שלושת אלפים ועשר~3,010 +שלושת אלפים וארבע עשרה~3,014 +שלושת אלפים מאה~3,100 +שלושת אלפים מאתיים~3,200 +מינוס שלושת אלפים שבע מאות עשרים ואחת~-3,721 +שלושת אלפים שמונה מאות~3,800 +שלושת אלפים ושמונה מאות~3,800 +שלושת אלפים תשע מאות ושבע~3,907 +מינוס שלושת אלפים מאתיים ועשרים~-3,220 +חמשת אלפים~5,000 +תשעת אלפים תשע מאות תשעים ותשע~9,999 +עשרת אלפים~10,000 +עשרת אלפים ואחת~10,001 +עשרת אלפים וחמש עשרה~10,015 +עשרת אלפים ועשרים~10,020 +עשרת אלפים עשרים ושלוש~10,023 +עשרת אלפים מאתיים~10,200 +עשרת אלפים מאתיים ואחד~10,201 +עשרת אלפים מאתיים ארבעים~10,240 +עשרת אלפים מאתיים וארבעים~10,240 +עשרת אלפים שלוש מאות חמישים~10,350 +עשרת אלפים שלוש מאות וחמישים~10,350 +שתיים עשרה אלף שש מאות~12,600 +שתיים 
עשרה אלף ושש מאות~12,600 +שתיים עשרה אלף שש מאות ואחת~12,601 +כשמונים ושבע אלף ועשר~כ-87,010 +תשעים ותשע אלף תשע מאות תשעים ותשע~99,999 +מאה אלף~100,000 +כמאה אלף תושבים~כ-100,000 תושבים +מאה אלף ושלוש~100,003 +מאה אלף ושתיים עשרה~100,012 +מאה אלף וארבעים~100,040 +מאה אלף ארבעים ושבע~100,047 +מאה אלף וארבעים ושבע~100,047 +מאה אלף ומאה~100,100 +מאה אלף מאה~100,100 +מאה אלף מאה שלושים ושלוש~100,133 +מאה ואחד אלף~101,000 +מאה ואחד אלף ואחת~101,001 +מאה ואחד אלף ועשר~101,010 +מאה ואחד אלף ואחת עשרה~101,011 +מאה ואחד אלף מאתיים~101,200 +כמאה ואחד אלף ומאתיים~כ-101,200 +מינוס מאה ואחת אלף מאתיים ועשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מאה ושתיים אלף~102,000 +מאה ושלוש אלף חמש מאות~103,500 +מאה ושלוש אלף וחמש מאות~103,500 +מאה וארבע אלף חמש מאות וארבע~104,504 +מאתיים ארבעים אלף~240,000 +מאתיים וארבעים אלף~240,000 +מאתיים חמישים וחמש אלף ושש~255,006 +מאתיים חמישים וחמש אלף וארבע מאות ושש~255,406 +מאתיים חמישים וחמש אלף ארבע מאות ושש~255,406 +חמש מאות חמישים וחמש אלף~555,000 +תשע מאות תשעים ותשע אלף תשע מאות תשעים ותשע~999,999 +מיליון~1,000,000 +מיליון ואחת~1,000,001 +מיליון ועשר~1,000,010 +מיליון חמש עשרה~1,000,015 +מיליון ושבעים~1,000,070 +מיליון שבעים~1,000,070 +מיליון ארבע מאות~1,000,400 +מיליון וארבע מאות~1,000,400 +מיליון ארבע מאות עשרים~1,000,420 +מיליון ארבע מאות ועשרים~1,000,420 +מיליון אלף~1,001,000 +מיליון שלושת אלפים~1,003,000 +מיליון ואלף~1,001,000 +מיליון אלף ואחת~1,001,001 +שלושה מיליון אלף~3,001,000 +שלושה מיליוןאלף וחמש~3,001,005 +שלושה מיליון ארבעים ושלוש אלף~3,043,000 +שלושה מיליון ארבעים ושלוש אלף ואחת~3,043,001 +שלושה מיליון ארבעים ושלוש אלף ושישים ואחת~3,043,061 +שלושה מיליון ארבעים ושלוש אלף שישים ואחת~3,043,061 +שלושה מיליון חמש מאות ארבעים ושלוש אלף~3,543,000 +שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~3,543,007 +מינוס שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~-3,543,007 +עשר מיליון~10 מיליון +עשרה מיליון~10 מיליון +עשרים מיליון~20 
מיליון +חמש עשרה מיליון~15 מיליון +שלוש עשרה מיליון ארבעים ושלוש אלף~13,043,000 +מאה מיליון~100 מיליון +מאה עשרים ושתיים מיליון~122 מיליון +מאה עשרים ושתיים מיליון ושלוש עשרה~122,000,013 +מאה עשרים ושתיים מיליון חמישים אלף ושלוש עשרה~122,050,013 +שלוש אלף~3,000 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..96b745de4 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +אחד במאי אלף תשע מאות שמונים ושלוש~1.5.1983 +השתיים עשרה לשתיים עשרה אלף תשע מאות תשעים ואחת~ה-12.12.1991 +השתיים עשרה בדצמבר אלף תשע מאות תשעים ואחת~ה-12.12.1991 +בינואר עשרים עשרים ואחת~בינואר 2021 +בשלישי לשלישי אלף תשע מאות תשעים~ב-3.3.1990 +העשירי באפריל~ה-10.4 +אחד במאי~1.5 +הראשון לחמישי~הראשון לחמישי +יוני אלפיים וחמש עשרה~יוני 2015 +ביוני אלפיים וחמש עשרה~ביוני 2015 +מתחיל בספטמבר עשרים עשרים~מתחיל בספטמבר 2020 +בשבעה עשר באוגוסט~ב-17.8 +בשבעה עשר באוגוסט עשרים שלושים~ב-17.8.2030 +בשבעה עשר לשמיני עשרים שלושים~ב-17.8.2030 +עשרים ושישי לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושש לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לאפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושש באפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לרביעי עשרים וארבע~26.4.24 +עשרים ושש לרביעי עשרים וארבע~26.4.24 +עשרים ושישי לאפריל עשרים וארבע~26.4.24 +עשרים ושש באפריל עשרים וארבע~26.4.24 +עשרים ושישה באפריל עשרים וארבע~26.4.24 +אנשים לא ידעו אחד מהשני~אנשים לא ידעו אחד מהשני +בשבעה באוקטובר~ב-7.10 +בשנת אלפיים וחמש~בשנת 2005 +משנת עשרים עשרים ואחת~משנת 2021 +השנה אלפיים ושלוש~השנה 2003 +שנת אלפיים וארבע~שנת 2004 \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..b864e264a --- /dev/null +++ 
b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,66 @@ +חמש נקודה שתיים מיליון~5.2 מיליון +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +ארבע מאות מיליון~400 מיליון +חמישים מיליארד~50 מיליארד +ארבע מאות וחמש מיליארד~405 מיליארד +ארבע נקודה שמונים וחמש מיליארד~4.85 מיליארד +מאה מיליארד~100 מיליארד +מאה ועשר מיליארד~110 מיליארד +מאה שלושים ושתיים מיליארד~132 מיליארד +אחד נקודה שמונים וארבע מיליארד~1.84 מיליארד +אחד נקודה שמונים ואחת מיליארד~1.81 מיליארד +אחד נקודה חמש תשע מיליארד~1.59 מיליארד +אחד נקודה ארבע חמש שלוש מיליארד~1.453 מיליארד +אחד נקודה שבעים ושתיים מיליארד~1.72 מיליארד +אחד נקודה שתיים חמש מיליארד~1.25 מיליארד +שלוש עשרה מיליארד~13 מיליארד +שלושים מיליארד~30 מיליארד +אלפיים שמונה מאות וחמש נקודה שמונה שבע שלוש מיליון~2,805.873 מיליון +עשרה מיליון~10 מיליון +עשר מיליון~10 מיליון +חמש מיליון~5 מיליון +חמש מאות מיליון~500 מיליון +שתיים עשרה מיליון~12 מיליון +שניים עשר מיליון~12 מיליון +שלוש עשרה מיליון~13 מיליון +ארבע מיליון~4 מיליון +ארבעים וחמש מיליון~45 מיליון +חמש עשרה מיליארד~15 מיליארד +שני מיליון~2 מיליון +שתי מיליון~2 מיליון +שמונה מיליון~8 מיליון +מינוס שישים נקודה שתיים ארבע אפס אפס~-60.2400 +אפס נקודה עשרים ושש~0.26 +אפס נקודה שתיים שש~0.26 +שישים נקודה שתיים~60.2 +שמונה עשרה נקודה שמונים וחמש~18.85 +שמונה עשרה נקודה חמש אפס~18.50 +שמונה עשרה נקודה חמישים ושש~18.56 +שמונה עשרה נקודה תשע~18.9 +שמונה עשרה נקודה אפס חמש~18.05 +שמונה עשרה נקודה שתיים עשרה~18.12 +שמונה עשרה נקודה אפס אחד~18.01 +שמונה עשרה נקודה אפס אפס אפס~18.000 +שמונה עשרה נקודה שש~18.6 +שמונה עשרה נקודה שלוש אפס אפס~18.300 +שמונה עשרה נקודה שלושים ושש~18.36 +שמונה עשרה נקודה שתיים חמש~18.25 +שמונה עשרה נקודה עשרים ושתיים~18.22 +שמונה מאות ושמונה עשרה נקודה שלוש אפס שלוש~818.303 +שמונה מאות ושמונה נקודה שמונה~808.8 +שמונה מאות ושמונה נקודה אפס~808.0 +שמונה מאות שמונים ושמונה נקודה אחד~888.1 +שמונה מאות שמונים וארבע נקודה שלוש~884.3 +שמונה מאות שמונים ושתיים נקודה שמונה~882.8 +שמונה מאות שמונים ושתיים נקודה אפס~882.0 +שמונה מאות 
ארבעים וחמש נקודה תשעים וארבע~845.94 +שבעים ותשע וחצי~79.5 +שתיים ורבע~שתיים ורבע +שלוש ועשירית~3.1 +מינוס שלוש וחצי~-3.5 +עשר ושתי עשיריות~10.2 +שתיים ושלושת רבעי~2.75 +שתיים עשרה אלף ושתיים עשרה נקודה שתיים עשרה~12,012.12 +שתים עשרה אלף ושתים עשרה נקודה שתים עשרה~12,012.12 +שתיים ועשירית~2.1 +אחת ועשירית~1.1 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..3d0a40a07 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,10 @@ +מינוס חמש עשרה אחוז~-15% +חמש עשרה אחוז~15% +מינוס שתים עשרה נקודה חמש מעלות ~-12.5° +שתיים עשרה נקודה חמש מעלות~12.5° +שתיים עשרה נקודה חמש מעלות צלסיוס~12.5°C +אלף אחוזים~1,000% +אחוז אחד~1% +מאתיים חמישים גרם~250 ג׳ +סנטימטר אחד~1 ס״מ +שלוש מיליגרם~3 מ״ג diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..0f6464445 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,34 @@ +בשעה חמש בצהריים~בשעה 17:00 בצהריים +בחמש בצהריים~ב-17:00 בצהריים +רבע לשש בבוקר~5:45 בבוקר +בתשע בבוקר~ב-9:00 בבוקר +השעה עשרים וחמישה לאחת בצהריים~השעה 12:35 בצהריים +נפגשנו באחת ושתי דקות בצהריים~נפגשנו ב-13:02 בצהריים +נפגשנו באחת ושלוש דקות בצהריים~נפגשנו ב-13:03 בצהריים +נפגשנו באחת וחמישה בצהריים~נפגשנו ב-13:05 בצהריים +שתיים ועשרה בבוקר~2:10 בבוקר +בשעה שתיים ועשרה בצהריים~בשעה 14:10 בצהריים +בשתיים ועשרה אחרי הצהריים~ב-14:10 אחרי הצהריים +שלוש ודקה בצהריים~15:01 בצהריים +ארבע ושלוש דקות אחרי הצהריים~16:03 אחרי הצהריים +שש ועשרים דקות בערב~18:20 בערב +בשש וחצי בערב~ב-18:30 בערב +חמישה לשלוש בבוקר~2:55 בבוקר +רבע לשש בערב~17:45 בערב +שלוש בצהריים~15:00 בצהריים +אחת לפנות בוקר~1:00 לפנות בוקר +אתמול בחמש אחרי 
הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור~אתמול ב-17:00 אחרי הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור +חמישה לחצות~23:55 +ברבע לחצות~ב-23:45 +בשעה חצות ועשרה~בשעה 0:10 +בחצות ודקה~ב-0:01 +חצות ושתיים עשרה דקות~0:12 +שלוש דקות לחצות~23:57 +חצות ושתי דקות~0:02 +חצות~0:00 +דקה לשלוש בצהריים~14:59 בצהריים +הפגישה זזה משבע בבוקר לשמונה וחצי בבוקר~הפגישה זזה מ-7:00 בבוקר ל-8:30 בבוקר +באחת בלילה~ב-1:00 בלילה +חמש לפנות ערב~17:00 לפנות ערב +בשלוש לפנות בוקר~ב-3:00 לפנות בוקר +עשרים לחמש אחרי הצהריים~16:40 אחרי הצהריים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..67e4d6560 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,4 @@ +בשנת שבעים לפני הספירה~בשנת 70 לפנה״ס +יש מאתיים חמישים עורכי דין חדשים~יש 250 עו״ד חדשים +ישראל היא המדינה החמישים ואחת של ארצות הברית~ישראל היא המדינה ה-51 של ארה״ב +דוקטור שמילוביץ רשם לי תרופה חדשה~ד״ר שמילוביץ רשם לי תרופה חדשה \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt new file mode 100644 index 000000000..1bf28b0fb --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt @@ -0,0 +1,56 @@ +אתמול בשעה שבע וחצי בבוקר היה לי תור לרופא~אתמול בשעה 7:30 בבוקר היה לי תור לרופא +הגעתי למרפאה בשבע ורבע בבוקר כדי לא לאחר~הגעתי למרפאה ב-7:15 בבוקר כדי לא לאחר +אמרתי לרופא שאני בן חמישים ושלוש שאני נשוי ויש לי שלושה ילדים.~אמרתי לרופא שאני בן 53 שאני נשוי ויש לי שלושה ילדים. 
+אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי ~אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי +הוא אמר שיש חשד לשבר באמה של בין שני סנטימטר לארבע סנטימטר ושנצטרך לעשות צילום כדי לדעת~הוא אמר שיש חשד לשבר באמה של בין 2 ס״מ ל-4 ס״מ ושנצטרך לעשות צילום כדי לדעת +בינתיים הוא רשם לי עשר מיליגרם של משככי כאבים~בינתיים הוא רשם לי 10 מ״ג של משככי כאבים +הוא אמר שזה מאוד נפוץ ושליותר מעשר אחוז מהאוכלוסיה יש את זה~הוא אמר שזה מאוד נפוץ ושליותר מ-10% מהאוכלוסיה יש את זה +הוא העריך את סיכויי ההחלמה בשמונים ושלוש נקודה שש אחוז~הוא העריך את סיכויי ההחלמה ב-83.6% +בסוף המפגש הוא קבע ביקורת לשמיני באוגוסט~בסוף המפגש הוא קבע ביקורת ל-8.8 +יש לי חמישה תפוחים~יש לי חמישה תפוחים +אף אחד לא רוצה~אף אחד לא רוצה +בכל כיתה יש עשרים, עשרים ושתיים תלמידים~בכל כיתה יש 20 , 22 תלמידים +בכל כיתה יש עשרים - עשרים ושתיים תלמידים~בכל כיתה יש 20 - 22 תלמידים +אחת עשרה אלף שבע מאות חמישים ושש~11,756 +ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של שלושים נקודה שתיים עשרה אחוז במכירות~ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של 30.12% במכירות +אני בטוח בזה במאה אחוז~אני בטוח בזה ב-100% +יש לזה שלושים ותשע נקודה שישים ושבע אחוז הצלחה~יש לזה 39.67% הצלחה +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +היום יום שני ומחר יום שלישי~היום יום שני ומחר יום שלישי +שלוש וחצי קילוגרם~3.5 ק״ג +חמש ורבע סנטימטר~5.25 ס״מ +שמונה ושלושת רבעי~8.75 +שמונה ורבע מיליון~8.25 מיליון +שתיים וחצי~שתיים וחצי +שתיים וחצי מיליון~2.5 מיליון +בשתיים וחצי מיליון~ב-2.5 מיליון +שתיים וחצי בבוקר~2:30 בבוקר +מינוס שלוש וחצי אחוז~-3.5% +שלוש וחצי~שלוש וחצי +עשרת אלפים ומאתיים ארבעים~10,240 +אפס מאופס~אפס מאופס +הוא מתנהג כמו אפס.... בקיצור כל עניין האפס~הוא מתנהג כמו אפס.... 
בקיצור כל עניין האפס +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +הפגישה זזה משבע וחצי בבוקר לשמונה~הפגישה זזה מ-7:30 בבוקר לשמונה +על סמך זה יצאנו ביום ראשון~על סמך זה יצאנו ביום ראשון +צעירים היו בגיל שלושים שלושים וחמש~צעירים היו בגיל 30 35 +אולי שניים שלושה~אולי שניים שלושה +בן הראשון שלי שנולד~בן הראשון שלי שנולד +אנחנו היינו איזה חמישה עשר איש~אנחנו היינו איזה 15 איש +התחילו לחזור וחזרו אחד אחד~התחילו לחזור וחזרו אחד אחד +וזה היה כבר אולי שעה תשע~וזה היה כבר אולי שעה תשע +אני מדבר על שמונה עשר באפריל~אני מדבר על 18.4 +שמונה עשר בינואר~18.1 +הייתה נראית כעת שתיים עשרה שלוש עשרה~הייתה נראית כעת 12 13 +היה בערך בעשירי בעשירי למאי~היה בערך בעשירי ב-10.5 +באמצע הלילה שתיים בלילה~באמצע הלילה 2:00 בלילה +למחרת בשעה חמש~למחרת בשעה חמש +בשנת אלף תשע מאות ארבעים ושמונה~בשנת 1948 +באלף תשע מאות ארבעים ושמונה~ב-1,948 +ארבע מאות וחמישים מיליגרם~450 מ״ג +ארבע וחצי~ארבע וחצי +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +מחר בשש וחצי בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד אחת וחצי בצהריים, אולי רבע לשתיים, ונרד בשביל הנחש~מחר ב-6:30 בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד 13:30 בצהריים , אולי 1:45 , ונרד בשביל הנחש +יש לי חמישה תפוחים~יש לי חמישה תפוחים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/test_cardinal.py b/tests/nemo_text_processing/he/test_cardinal.py new file mode 100644 index 000000000..4700725b1 --- /dev/null +++ b/tests/nemo_text_processing/he/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_date.py b/tests/nemo_text_processing/he/test_date.py new file mode 100644 index 000000000..73c183e7b --- /dev/null +++ b/tests/nemo_text_processing/he/test_date.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_decimal.py b/tests/nemo_text_processing/he/test_decimal.py new file mode 100644 index 000000000..125fc31d0 --- /dev/null +++ b/tests/nemo_text_processing/he/test_decimal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_full_sentences.py b/tests/nemo_text_processing/he/test_full_sentences.py new file mode 100644 index 000000000..0bc9251a7 --- /dev/null +++ b/tests/nemo_text_processing/he/test_full_sentences.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFullSentences: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_full_sentences.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_measure.py b/tests/nemo_text_processing/he/test_measure.py new file mode 100644 index 000000000..1649effa7 --- /dev/null +++ b/tests/nemo_text_processing/he/test_measure.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..bce2e24b9 --- /dev/null +++ b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,61 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() 
{ + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + + +testITNTime() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + + +testITNWhitelist() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/he/test_time.py b/tests/nemo_text_processing/he/test_time.py new file mode 100644 index 000000000..f3bba67b5 --- /dev/null +++ b/tests/nemo_text_processing/he/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_whitelist.py b/tests/nemo_text_processing/he/test_whitelist.py new file mode 100644 index 000000000..fb14c2a58 --- /dev/null +++ b/tests/nemo_text_processing/he/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 6ba21de69..2a52b2a20 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -143,3 +143,5 @@ ११०२२३४५५६७~ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ २ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल +०५~शून्य पाँच +०१~शून्य एक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..9bdcab2a4 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,62 @@ +१ला~पहला +१ली~पहली +२रा~दूसरा +२री~दूसरी +३रा~तीसरा +३री~तीसरी +४था~चौथा +४थी~चौथी +५वां~पाँचवां +५वीं~पाँचवीं +६ठा~छठा +६ठी~छठी +७वां~सातवां +७वीं~सातवीं +८वां~आठवां +८वीं~आठवीं +९वां~नौवां +९वीं~नौवीं +११वां~ग्यारहवां +१२वीं~बारहवीं +१४वां~चौदहवां +१६वीं~सोलहवीं +१७वां~सत्रहवां +१८वीं~अठारहवीं +१९वां~उन्नीसवां +२०वां~बीसवां +२१वां~इक्कीसवां +२५वीं~पच्चीसवीं +२७वें~सत्ताईसवें 
+३०वीं~तीसवीं +३३वां~तैंतीसवां +४०वीं~चालीसवीं +४५वां~पैंतालीसवां +५०वां~पचासवां +५६वें~छप्पनवें +६०वां~साठवां +६७वीं~सड़सठवीं +७५वीं~पचहत्तरवीं +८०वें~अस्सीवें +८८वां~अट्ठासीवां +९१वीं~इक्यानबेवीं +९९वां~निन्यानबेवां +१००वां~एक सौवां +१०१वां~एक सौ एकवां +१११वीं~एक सौ ग्यारहवीं +१२५वें~एक सौ पच्चीसवें +१५३वीं~एक सौ तिरेपनवीं +२००वीं~दो सौवीं +२१९वीं~दो सौ उन्नीसवीं +२४०वां~दो सौ चालीसवां +३२९वां~तीन सौ उनतीसवां +३६५वां~तीन सौ पैंसठवां +४५५वां~चार सौ पचपनवां +५५५वीं~पाँच सौ पचपनवीं +६४०वीं~छह सौ चालीसवीं +८९०वां~आठ सौ नब्बेवां +१००१वीं~एक हज़ार एकवीं +१०९१वें~एक हज़ार इक्यानबेवें +१७८२वीं~सत्रह सौ बयासीवीं +१८९०वां~एक हज़ार आठ सौ नब्बेवां +१९८१वीं~उन्नीस सौ इक्यासीवीं +९८२६वीं~अट्ठानबे सौ छब्बीसवीं \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..7a1b2c662 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,25 @@ +मेरा पुराना नंबर था ९१५७११४००७~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो ०३८६२-३५१७९१~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो १३७४-३०९९८८~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो ०१६८९११-४५७३~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++९१ ७४४०४३१०८३ मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++९१ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक १२३४ दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड ११००२३ है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है +मेरा पुराना नंबर था 9157114007~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो 03862-351791~इसपे कॉल करो शून्य तीन आठ छह दो 
तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो 1374 309988~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो 0168911-4573~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++91 7440431083 मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++91 9210515606 मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक 1234 दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड 110023 है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है ++1 9210515606 मेरे इस नंबर पे कॉल करो~प्लस एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++४९ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस चार नौ नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++353 9210515606 मेरे इस नंबर पे कॉल करो~प्लस तीन पाँच तीन नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++91 9876543210 123~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन ++1 6234517890 123~प्लस एक छह दो तीन चार पाँच एक सात आठ नौ शून्य एक दो तीन ++९१ ९८७६५४३२१० १२३~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन +(02229) 411128~शून्य दो दो दो नौ चार एक एक एक दो आठ +०२२.२९४१११२८~शून्य दो दो दो नौ चार एक एक एक दो आठ +0 (80) 26411128~शून्य आठ शून्य दो छह चार एक एक एक दो आठ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_ordinal.py b/tests/nemo_text_processing/hi/test_ordinal.py index b65252694..3e5f4bfbb 100644 --- a/tests/nemo_text_processing/hi/test_ordinal.py +++ b/tests/nemo_text_processing/hi/test_ordinal.py @@ -17,13 +17,24 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestOrdinal: + normalizer = Normalizer( + input_case='cased', lang='hi', 
cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..a0b0931e2 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -76,15 +76,15 @@ testTNMoney() { runtest $input } -#testTNOrdinal() { -# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt -# runtest $input -#} +testTNOrdinal() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt + runtest $input +} -#testTNTelephone() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt -# runtest $input -#} +testTNTelephone() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt + runtest $input +} testTNTime() { input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index 7e43f7e82..e7b9f1c3d 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -16,12 +16,16 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import 
Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestTelephone: inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +33,10 @@ class TestTelephone: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..846973eee 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -101,6 +101,7 @@ def parse_args(): 'ar', 'it', 'es_en', + 'he', 'hi', 'hy', 'mr', @@ -283,6 +284,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'he': + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, @@ -312,6 +320,8 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from 
nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + else: + raise KeyError(f"Language {args.language} is not defined for export.") output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir,