From 6b08af63a6c6514c8480d157a5d13136dca24aaf Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 12:33:42 +0530 Subject: [PATCH 1/7] Add additional units and plurals Signed-off-by: Anand Joseph --- .../en/data/measurements.tsv | 37 ++++++++++++++++--- .../text_normalization/en/data/suppletive.tsv | 26 +++++++++++++ 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv index 6801b7b42..05216d27b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv @@ -1,5 +1,8 @@ -f fahrenheit -c celsius +°f fahrenheit +°c celsius +°f degree fahrenheit +°c degree celsius +k kelvin km kilometer m meter cm centimeter @@ -35,7 +38,6 @@ mv milli volt mw megawatt μm micrometer " inch -tb terabyte cc c c g gram da dalton @@ -47,13 +49,38 @@ oz ounce hl hecto liter μg microgram pg petagram -gb gigabyte kb kilobit +mb megabit +gb gigabit +tb terabit +pb petabit ev electron volt mb megabyte kb kilobyte +gb gigabyte +tb terabyte +pb peta byte +bps bit per second kbps kilobit per second mbps megabit per second +gbps gigabit per second +kbps kilo bit per second +mbps mega bit per second +mbps mega bit per second +tbps terabit per second +tbps tera bit per second +pbps petabit per second +pbps peta bit per second +kb/s kilobyte per second +kb/s kilo byte per second +mb/s megabyte per second +mb/s mega byte per second +gb/s gigabyte per second +gb/s giga byte per second +tb/s terabyte per second +tb/s tera byte per second +pb/s petabyte per second +pb/s peta byte per second st stone kl kilo liter tj tera joule @@ -86,7 +113,6 @@ ms milli second dm deci meter dm³ cubic deci meter amu atomic mass unit -mb megabit mf mega farad bq becquerel pb petabit @@ -100,7 +126,6 @@ tl tera liter ms mega second mpa megapascal pm peta meter -pb peta byte gwh giga watt hour kcal kilo calory gy gray diff --git a/nemo_text_processing/text_normalization/en/data/suppletive.tsv b/nemo_text_processing/text_normalization/en/data/suppletive.tsv index 115460aa2..9b853f397 100644 --- a/nemo_text_processing/text_normalization/en/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/en/data/suppletive.tsv @@ -36,9 +36,35 @@ revolution per minute revolutions per minute mile per hour miles per hour megabit per second megabits per second square foot square feet +centimeter per second centimeters per second +meter per second meters per second +kilometer per second kilometers per second +meter per hour meters per hour +bit per second bits per second +kilometer per hour kilometers per hour kilobit per second kilobits per second +kilo bit per second kilo bits per second +megabit per second megabits per second +mega bit per second mega bits per second +terabit per second terabits per second +tera bit per second tera bits per second +petabit per second petabits per second +peta bit per second peta bits per second +byte per second bytes per second +kilobyte per second kilobytes per second +kilo byte per second kilo bytes per second +megabyte per second megabytes per second +mega byte per second mega bytes per second +gigabyte per second gigabytes per second +giga byte per second giga bytes per second +terabyte per second terabytes per second +tera byte per second tera bytes per second +petabyte per second petabytes per second +peta byte per second peta bytes per second degree Celsius degrees Celsius degree Fahrenheit degrees Fahrenheit +degree celsius degrees celsius +degree fahrenheit degrees fahrenheit ATM AU BQ From c187b80c9883d8847ff5f0a4df56dbf41cf7765b Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 18:20:14 +0530 Subject: [PATCH 2/7] Add support for financial periods (1H22, 2Q19) Signed-off-by: Anand Joseph --- .../text_normalization/en/taggers/date.py | 21 ++++++++++++++++--- .../text_normalization/en/verbalizers/date.py | 16 ++++++++++---- .../test_cases_date.txt | 3 ++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 45031b6e5..3b22c83a0 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -138,9 +138,21 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): def _get_two_digit_year(cardinal_graph, single_digits_graph): - wo_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA) - return wo_digit_year + two_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA) + return two_digit_year +def _get_financial_period_graph(): + # 1H23 -> first half of twenty three + # 3Q22 -> third quarter of twenty two + + h_ordinals=pynini.cross('1','first') | pynini.cross('2','second') + q_ordinals = h_ordinals | pynini.cross('3','third') | pynini.cross('4','fourth') + + h_graph = h_ordinals+pynini.cross('H',' half') + q_graph = q_ordinals+pynini.cross('Q',' quarter') + period_graph = h_graph|q_graph + + return period_graph class DateFst(GraphFst): """ @@ -297,7 +309,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False): else: final_graph += pynutil.insert(" preserve_order: true") - final_graph |= graph_ymd | year_graph + period_fy = pynutil.insert("period: \"") + _get_financial_period_graph() + pynutil.insert("\"") + graph_fy = period_fy + insert_space + two_digit_year + + final_graph |= graph_ymd | year_graph | graph_fy if not deterministic or lm: ymd_to_mdy_graph = None diff --git a/nemo_text_processing/text_normalization/en/verbalizers/date.py b/nemo_text_processing/text_normalization/en/verbalizers/date.py index 191d01063..e0a48c1c2 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/date.py @@ -39,7 +39,7 @@ class DateFst(GraphFst): def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="date", kind="verbalize", deterministic=deterministic) - month = pynini.closure(NEMO_NOT_QUOTE, 1) + phrase = pynini.closure(NEMO_NOT_QUOTE, 1) day_cardinal = ( pynutil.delete("day:") + delete_space @@ -48,8 +48,8 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal + pynutil.delete("\"") ) day = day_cardinal @ ordinal.suffix - - month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"") + period = pynutil.delete("period:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"") + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"") year = ( pynutil.delete("year:") @@ -60,6 +60,14 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal + pynutil.delete("\"") ) + # financial period + graph_fy = ( + pynutil.insert("the ") + + period + + pynutil.insert(" of ") + + pynini.closure(delete_extra_space + year, 0, 1) + ) + # month (day) year graph_mdy = ( month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1) @@ -93,7 +101,7 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal ) final_graph = ( - (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year) + (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year | graph_fy) + delete_space + optional_preserve_order ) diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt index 8b97fce4a..95bfb9172 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt @@ -50,4 +50,5 @@ Jan-15~january fifteenth 15-01-2020~the fifteenth of january twenty twenty 15.01.2020~the fifteenth of january twenty twenty 340 A.D~three forty AD -1998/2/30~february thirtieth nineteen ninety eight \ No newline at end of file +1998/2/30~february thirtieth nineteen ninety eight +We have seen YoY growth in 2Q22~We have seen YoY growth in the second quarter of twenty two \ No newline at end of file From ee9e63ac6d62b334d225afd14bc21d47267765ac Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 18:28:30 +0530 Subject: [PATCH 3/7] Add missing plural for "gigabit per second" Signed-off-by: Anand Joseph --- .../text_normalization/en/data/suppletive.tsv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/en/data/suppletive.tsv b/nemo_text_processing/text_normalization/en/data/suppletive.tsv index 9b853f397..939d2e249 100644 --- a/nemo_text_processing/text_normalization/en/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/en/data/suppletive.tsv @@ -46,6 +46,8 @@ kilobit per second kilobits per second kilo bit per second kilo bits per second megabit per second megabits per second mega bit per second mega bits per second +gigabit per second gigabits per second +giga bit per second giga bits per second terabit per second terabits per second tera bit per second tera bits per second petabit per second petabits per second @@ -106,4 +108,4 @@ PS S TB YB -ZB \ No newline at end of file +ZB From faa28a316ee39e672dd96e83a5e79430f7a37408 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 19:46:20 +0530 Subject: [PATCH 4/7] Fix for measures Signed-off-by: Anand Joseph --- .../inverse_text_normalization/en/data/measurements.tsv | 2 +- .../text_normalization/en/data/suppletive.tsv | 1 + .../data_inverse_text_normalization/test_cases_measure.txt | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv index 05216d27b..c9672adf2 100644 --- a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv @@ -2,7 +2,7 @@ °c celsius °f degree fahrenheit °c degree celsius -k kelvin +k kelvin km kilometer m meter cm centimeter diff --git a/nemo_text_processing/text_normalization/en/data/suppletive.tsv b/nemo_text_processing/text_normalization/en/data/suppletive.tsv index 939d2e249..4718e27bd 100644 --- a/nemo_text_processing/text_normalization/en/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/en/data/suppletive.tsv @@ -67,6 +67,7 @@ degree Celsius degrees Celsius degree Fahrenheit degrees Fahrenheit degree celsius degrees celsius degree fahrenheit degrees fahrenheit +kelvin kelvin ATM AU BQ diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt index 4f6f540e6..91cfb7470 100644 --- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt @@ -110,3 +110,8 @@ eight point five megawatts~8.5 mw eight point five meters~8.5 m eight point five two percent~8.52 % eight point four four percent~8.44 % +one gigabit per second~1 gbps +nine gigabits per second~9 gbps +five degrees celsius~5 °c +seventy two degrees fahrenheit~72 °f +two hundred seventy three kelvin~273 k From 16dfae767cbc3292d6ad3b35f8e9855fdfe80d70 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 19:52:28 +0530 Subject: [PATCH 5/7] Use environment variables to set path of fst cache Signed-off-by: Anand Joseph --- Jenkinsfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7d9aa2741..71e525860 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,9 @@ pipeline { timeout(time: 2, unit: 'HOURS') disableConcurrentBuilds(abortPrevious: true) } - + environment { + EN_TN_CACHE=/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1 + } stages { stage('Add git safe directory'){ @@ -53,17 +55,17 @@ pipeline { parallel { stage('L0: En TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}' } } stage('L0: En TN non-deterministic grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir ${EN_TN_CACHE}' } } stage('L0: En ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}' } } @@ -81,7 +83,7 @@ pipeline { parallel { stage('L1: Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') { steps { - sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir ${EN_TN_CACHE}' } } } @@ -99,7 +101,7 @@ pipeline { stage('L2: Eng TN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \ - cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1' + cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1' sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \ cat $NORM_OUTPUT_DIR/test.pynini.txt && \ @@ -111,7 +113,7 @@ pipeline { stage('L2: Eng ITN export') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \ - cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1' + cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1' sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \ cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \ From b8cdc3a406c4fd1fbb2af248719f0036df7acddc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:25:39 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/en/taggers/date.py | 12 +++++++----- .../text_normalization/en/verbalizers/date.py | 5 +---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 3b22c83a0..2a9b4507b 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -141,19 +141,21 @@ def _get_two_digit_year(cardinal_graph, single_digits_graph): two_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA) return two_digit_year + def _get_financial_period_graph(): # 1H23 -> first half of twenty three # 3Q22 -> third quarter of twenty two - h_ordinals=pynini.cross('1','first') | pynini.cross('2','second') - q_ordinals = h_ordinals | pynini.cross('3','third') | pynini.cross('4','fourth') + h_ordinals = pynini.cross('1', 'first') | pynini.cross('2', 'second') + q_ordinals = h_ordinals | pynini.cross('3', 'third') | pynini.cross('4', 'fourth') - h_graph = h_ordinals+pynini.cross('H',' half') - q_graph = q_ordinals+pynini.cross('Q',' quarter') - period_graph = h_graph|q_graph + h_graph = h_ordinals + pynini.cross('H', ' half') + q_graph = q_ordinals + pynini.cross('Q', ' quarter') + period_graph = h_graph | q_graph return period_graph + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. diff --git a/nemo_text_processing/text_normalization/en/verbalizers/date.py b/nemo_text_processing/text_normalization/en/verbalizers/date.py index e0a48c1c2..780b2681a 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/date.py @@ -62,10 +62,7 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal # financial period graph_fy = ( - pynutil.insert("the ") - + period - + pynutil.insert(" of ") - + pynini.closure(delete_extra_space + year, 0, 1) + pynutil.insert("the ") + period + pynutil.insert(" of ") + pynini.closure(delete_extra_space + year, 0, 1) ) # month (day) year From 82a956d8b2bb06a1599c8f804c01e76f291b315e Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 3 Feb 2023 19:58:28 +0530 Subject: [PATCH 7/7] Fix environment variable Signed-off-by: Anand Joseph --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 71e525860..05188e114 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,7 +10,7 @@ pipeline { disableConcurrentBuilds(abortPrevious: true) } environment { - EN_TN_CACHE=/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1 + EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1' } stages {