Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ pipeline {
timeout(time: 2, unit: 'HOURS')
disableConcurrentBuilds(abortPrevious: true)
}

environment {
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1'
}
stages {

stage('Add git safe directory'){
Expand Down Expand Up @@ -53,17 +55,17 @@ pipeline {
parallel {
stage('L0: En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}'
}
}
stage('L0: En TN non-deterministic grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir ${EN_TN_CACHE}'
}
}
stage('L0: En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}'
}
}

Expand All @@ -81,7 +83,7 @@ pipeline {
parallel {
stage('L1: Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir ${EN_TN_CACHE}'
}
}
}
Expand All @@ -99,7 +101,7 @@ pipeline {
stage('L2: Eng TN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cat $NORM_OUTPUT_DIR/test.pynini.txt && \
Expand All @@ -111,7 +113,7 @@ pipeline {
stage('L2: Eng ITN export') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
f fahrenheit
c celsius
°f fahrenheit
°c celsius
°f degree fahrenheit
°c degree celsius
k kelvin
km kilometer
m meter
cm centimeter
Expand Down Expand Up @@ -35,7 +38,6 @@ mv milli volt
mw megawatt
μm micrometer
" inch
tb terabyte
cc c c
g gram
da dalton
Expand All @@ -47,13 +49,38 @@ oz ounce
hl hecto liter
μg microgram
pg petagram
gb gigabyte
kb kilobit
mb megabit
gb gigabit
tb terabit
pb petabit
ev electron volt
mb megabyte
kb kilobyte
gb gigabyte
tb terabyte
pb peta byte
bps bit per second
kbps kilobit per second
mbps megabit per second
gbps gigabit per second
kbps kilo bit per second
mbps mega bit per second
gbps giga bit per second
tbps terabit per second
tbps tera bit per second
pbps petabit per second
pbps peta bit per second
kb/s kilobyte per second
kb/s kilo byte per second
mb/s megabyte per second
mb/s mega byte per second
gb/s gigabyte per second
gb/s giga byte per second
tb/s terabyte per second
tb/s tera byte per second
pb/s petabyte per second
pb/s peta byte per second
st stone
kl kilo liter
tj tera joule
Expand Down Expand Up @@ -86,7 +113,6 @@ ms milli second
dm deci meter
dm³ cubic deci meter
amu atomic mass unit
mb megabit
mf mega farad
bq becquerel
pb petabit
Expand All @@ -100,7 +126,6 @@ tl tera liter
ms mega second
mpa megapascal
pm peta meter
pb peta byte
gwh giga watt hour
kcal kilo calory
gy gray
Expand Down
31 changes: 30 additions & 1 deletion nemo_text_processing/text_normalization/en/data/suppletive.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,38 @@ revolution per minute revolutions per minute
mile per hour miles per hour
megabit per second megabits per second
square foot square feet
centimeter per second centimeters per second
meter per second meters per second
kilometer per second kilometers per second
meter per hour meters per hour
bit per second bits per second
kilometer per hour kilometers per hour
kilobit per second kilobits per second
kilo bit per second kilo bits per second
megabit per second megabits per second
mega bit per second mega bits per second
gigabit per second gigabits per second
giga bit per second giga bits per second
terabit per second terabits per second
tera bit per second tera bits per second
petabit per second petabits per second
peta bit per second peta bits per second
byte per second bytes per second
kilobyte per second kilobytes per second
kilo byte per second kilo bytes per second
megabyte per second megabytes per second
mega byte per second mega bytes per second
gigabyte per second gigabytes per second
giga byte per second giga bytes per second
terabyte per second terabytes per second
tera byte per second tera bytes per second
petabyte per second petabytes per second
peta byte per second peta bytes per second
degree Celsius degrees Celsius
degree Fahrenheit degrees Fahrenheit
degree celsius degrees celsius
degree fahrenheit degrees fahrenheit
kelvin kelvin
ATM
AU
BQ
Expand Down Expand Up @@ -80,4 +109,4 @@ PS
S
TB
YB
ZB
ZB
23 changes: 20 additions & 3 deletions nemo_text_processing/text_normalization/en/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,22 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True):


def _get_two_digit_year(cardinal_graph, single_digits_graph):
    """
    Build the graph used to read a two-digit year.

    The input side is restricted to exactly two digits (``NEMO_DIGIT ** 2``) and is
    composed with the priority union of ``cardinal_graph`` and ``single_digits_graph``.

    Args:
        cardinal_graph: graph passed as the first argument of the priority union
            (presumably the preferred reading -- confirm against ``plurals._priority_union``)
        single_digits_graph: graph passed as the second argument of the priority union

    Returns:
        the composed two-digit year graph
    """
    two_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
    return two_digit_year


def _get_financial_period_graph():
    """
    Build the graph that spells out a financial period marker.

    A digit followed by 'H' becomes an ordinal plus " half" (digits 1-2), and a digit
    followed by 'Q' becomes an ordinal plus " quarter" (digits 1-4). Only the period
    marker is rewritten here; the year part of the token is not handled by this graph.

    Examples of full tokens this marker appears in:
        1H23 -> first half of twenty three
        3Q22 -> third quarter of twenty two

    Returns:
        graph accepting either a half marker or a quarter marker
    """
    # ordinals valid for halves: 1, 2
    first_two = pynini.union(pynini.cross('1', 'first'), pynini.cross('2', 'second'))
    # ordinals valid for quarters: 1, 2, 3, 4
    first_four = pynini.union(first_two, pynini.cross('3', 'third'), pynini.cross('4', 'fourth'))

    halves = first_two + pynini.cross('H', ' half')
    quarters = first_four + pynini.cross('Q', ' quarter')

    return pynini.union(halves, quarters)


class DateFst(GraphFst):
Expand Down Expand Up @@ -297,7 +311,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
else:
final_graph += pynutil.insert(" preserve_order: true")

final_graph |= graph_ymd | year_graph
period_fy = pynutil.insert("period: \"") + _get_financial_period_graph() + pynutil.insert("\"")
graph_fy = period_fy + insert_space + two_digit_year

final_graph |= graph_ymd | year_graph | graph_fy

if not deterministic or lm:
ymd_to_mdy_graph = None
Expand Down
13 changes: 9 additions & 4 deletions nemo_text_processing/text_normalization/en/verbalizers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class DateFst(GraphFst):
def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
super().__init__(name="date", kind="verbalize", deterministic=deterministic)

month = pynini.closure(NEMO_NOT_QUOTE, 1)
phrase = pynini.closure(NEMO_NOT_QUOTE, 1)
day_cardinal = (
pynutil.delete("day:")
+ delete_space
Expand All @@ -48,8 +48,8 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
+ pynutil.delete("\"")
)
day = day_cardinal @ ordinal.suffix

month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"")
period = pynutil.delete("period:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"")
month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"")

year = (
pynutil.delete("year:")
Expand All @@ -60,6 +60,11 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
+ pynutil.delete("\"")
)

# financial period
graph_fy = (
pynutil.insert("the ") + period + pynutil.insert(" of ") + pynini.closure(delete_extra_space + year, 0, 1)
)

# month (day) year
graph_mdy = (
month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1)
Expand Down Expand Up @@ -93,7 +98,7 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
)

final_graph = (
(plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year)
(plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year | graph_fy)
+ delete_space
+ optional_preserve_order
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,8 @@ eight point five megawatts~8.5 mw
eight point five meters~8.5 m
eight point five two percent~8.52 %
eight point four four percent~8.44 %
one gigabit per second~1 gbps
nine gigabits per second~9 gbps
five degrees celsius~5 °c
seventy two degrees fahrenheit~72 °f
two hundred seventy three kelvin~273 k
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,5 @@ Jan-15~january fifteenth
15-01-2020~the fifteenth of january twenty twenty
15.01.2020~the fifteenth of january twenty twenty
340 A.D~three forty AD
1998/2/30~february thirtieth nineteen ninety eight
1998/2/30~february thirtieth nineteen ninety eight
We have seen YoY growth in 2Q22~We have seen YoY growth in the second quarter of twenty two