diff --git a/Jenkinsfile b/Jenkinsfile index 7d9aa2741..05188e114 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,9 @@ pipeline { timeout(time: 2, unit: 'HOURS') disableConcurrentBuilds(abortPrevious: true) } - + environment { + EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1' + } stages { stage('Add git safe directory'){ @@ -53,17 +55,17 @@ pipeline { parallel { stage('L0: En TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}' } } stage('L0: En TN non-deterministic grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir ${EN_TN_CACHE}' } } stage('L0: En ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}' } } @@ -81,7 +83,7 @@ pipeline { parallel { stage('L1: Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') { steps { - sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3' + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir ${EN_TN_CACHE}' } } } @@ -99,7 +101,7 @@ pipeline { stage('L2: Eng TN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \ - cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1' + cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1' sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \ cat $NORM_OUTPUT_DIR/test.pynini.txt && \ @@ -111,7 +113,7 @@ pipeline { stage('L2: Eng ITN export') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \ - cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1' + cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1' sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \ cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \ diff --git a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv index 6801b7b42..c9672adf2 100644 --- a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv @@ -1,5 +1,8 @@ -f fahrenheit -c celsius +°f fahrenheit +°c celsius +°f degree fahrenheit +°c degree celsius +k kelvin km kilometer m meter cm centimeter @@ -35,7 +38,6 @@ mv milli volt mw megawatt μm micrometer " inch -tb terabyte cc c c g gram da dalton @@ -47,13 +49,38 @@ oz ounce hl hecto liter μg microgram pg petagram -gb gigabyte kb kilobit +mb megabit +gb gigabit +tb terabit +pb petabit ev electron volt mb megabyte kb kilobyte +gb gigabyte +tb terabyte +pb peta byte +bps bit per second kbps kilobit per second mbps megabit per second +gbps gigabit per second +kbps kilo bit per second +mbps mega bit per second +mbps mega bit per second +tbps terabit per second +tbps tera bit per second +pbps petabit per second +pbps peta bit per second +kb/s kilobyte per second +kb/s kilo byte per second +mb/s megabyte per second +mb/s mega byte per second +gb/s gigabyte per second +gb/s giga byte per second +tb/s terabyte per second +tb/s tera byte per second +pb/s petabyte per second +pb/s peta byte per second st stone kl kilo liter tj tera joule @@ -86,7 +113,6 @@ ms milli second dm deci meter dm³ cubic deci meter amu atomic mass unit -mb megabit mf mega farad bq becquerel pb petabit @@ -100,7 +126,6 @@ tl tera liter ms mega second mpa megapascal pm peta meter -pb peta byte gwh giga watt hour kcal kilo calory gy gray diff --git a/nemo_text_processing/text_normalization/en/data/suppletive.tsv b/nemo_text_processing/text_normalization/en/data/suppletive.tsv index 115460aa2..4718e27bd 100644 --- a/nemo_text_processing/text_normalization/en/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/en/data/suppletive.tsv @@ -36,9 +36,38 @@ revolution per minute revolutions per minute mile per hour miles per hour megabit per second megabits per second square foot square feet +centimeter per second centimeters per second +meter per second meters per second +kilometer per second kilometers per second +meter per hour meters per hour +bit per second bits per second +kilometer per hour kilometers per hour kilobit per second kilobits per second +kilo bit per second kilo bits per second +megabit per second megabits per second +mega bit per second mega bits per second +gigabit per second gigabits per second +giga bit per second giga bits per second +terabit per second terabits per second +tera bit per second tera bits per second +petabit per second petabits per second +peta bit per second peta bits per second +byte per second bytes per second +kilobyte per second kilobytes per second +kilo byte per second kilo bytes per second +megabyte per second megabytes per second +mega byte per second mega bytes per second +gigabyte per second gigabytes per second +giga byte per second giga bytes per second +terabyte per second terabytes per second +tera byte per second tera bytes per second +petabyte per second petabytes per second +peta byte per second peta bytes per second degree Celsius degrees Celsius degree Fahrenheit degrees Fahrenheit +degree celsius degrees celsius +degree fahrenheit degrees fahrenheit +kelvin kelvin ATM AU BQ @@ -80,4 +109,4 @@ PS S TB YB -ZB \ No newline at end of file +ZB diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 45031b6e5..2a9b4507b 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -138,8 +138,22 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): def _get_two_digit_year(cardinal_graph, single_digits_graph): - wo_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA) - return wo_digit_year + two_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA) + return two_digit_year + + +def _get_financial_period_graph(): + # 1H23 -> first half of twenty three + # 3Q22 -> third quarter of twenty two + + h_ordinals = pynini.cross('1', 'first') | pynini.cross('2', 'second') + q_ordinals = h_ordinals | pynini.cross('3', 'third') | pynini.cross('4', 'fourth') + + h_graph = h_ordinals + pynini.cross('H', ' half') + q_graph = q_ordinals + pynini.cross('Q', ' quarter') + period_graph = h_graph | q_graph + + return period_graph class DateFst(GraphFst): @@ -297,7 +311,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False): else: final_graph += pynutil.insert(" preserve_order: true") - final_graph |= graph_ymd | year_graph + period_fy = pynutil.insert("period: \"") + _get_financial_period_graph() + pynutil.insert("\"") + graph_fy = period_fy + insert_space + two_digit_year + + final_graph |= graph_ymd | year_graph | graph_fy if not deterministic or lm: ymd_to_mdy_graph = None diff --git a/nemo_text_processing/text_normalization/en/verbalizers/date.py b/nemo_text_processing/text_normalization/en/verbalizers/date.py index 191d01063..780b2681a 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/date.py @@ -39,7 +39,7 @@ class DateFst(GraphFst): def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="date", kind="verbalize", deterministic=deterministic) - month = pynini.closure(NEMO_NOT_QUOTE, 1) + phrase = pynini.closure(NEMO_NOT_QUOTE, 1) day_cardinal = ( pynutil.delete("day:") + delete_space @@ -48,8 +48,8 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal + pynutil.delete("\"") ) day = day_cardinal @ ordinal.suffix - - month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"") + period = pynutil.delete("period:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"") + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"") year = ( pynutil.delete("year:") @@ -60,6 +60,11 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal + pynutil.delete("\"") ) + # financial period + graph_fy = ( + pynutil.insert("the ") + period + pynutil.insert(" of ") + pynini.closure(delete_extra_space + year, 0, 1) + ) + # month (day) year graph_mdy = ( month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1) @@ -93,7 +98,7 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal ) final_graph = ( - (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year) + (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year | graph_fy) + delete_space + optional_preserve_order ) diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt index 4f6f540e6..91cfb7470 100644 --- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt @@ -110,3 +110,8 @@ eight point five megawatts~8.5 mw eight point five meters~8.5 m eight point five two percent~8.52 % eight point four four percent~8.44 % +one gigabit per second~1 gbps +nine gigabits per second~9 gbps +five degrees celsius~5 °c +seventy two degrees fahrenheit~72 °f +two hundred seventy three kelvin~273 k diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt index 8b97fce4a..95bfb9172 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt @@ -50,4 +50,5 @@ Jan-15~january fifteenth 15-01-2020~the fifteenth of january twenty twenty 15.01.2020~the fifteenth of january twenty twenty 340 A.D~three forty AD -1998/2/30~february thirtieth nineteen ninety eight \ No newline at end of file +1998/2/30~february thirtieth nineteen ninety eight +We have seen YoY growth in 2Q22~We have seen YoY growth in the second quarter of twenty two \ No newline at end of file