Skip to content

Commit

Permalink
refactor tn data folder, and update of measure (#4028)
Browse files Browse the repository at this point in the history
* refactor tn data folder, and update of measure

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* update jenkins

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* added whitelist with spaces for asr

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
  • Loading branch information
yzhang123 committed Apr 21, 2022
1 parent ee2f9e2 commit ea201d2
Show file tree
Hide file tree
Showing 62 changed files with 307 additions and 193 deletions.
22 changes: 11 additions & 11 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,18 @@ pipeline {
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19'
}
}
}
Expand All @@ -152,8 +152,8 @@ pipeline {
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --lang=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --lang=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_04-14.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
Expand All @@ -162,32 +162,32 @@ pipeline {

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --lang=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --lang=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
}
}
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --lang=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text "The total amounts to \\$4.76." \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --lang=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --lang=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-19'
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def parse_args():
parser = ArgumentParser()
parser.add_argument("--input", help="input file path", required=True, type=str)
parser.add_argument(
"--lang", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str
"--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str
)
parser.add_argument("--output", help="output file path", required=True, type=str)
parser.add_argument("--verbose", help="print denormalization info. For debugging", action='store_true')
Expand All @@ -67,7 +67,7 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
file_path = args.input
inverse_normalizer = InverseNormalizer(lang=args.lang)
inverse_normalizer = InverseNormalizer(lang=args.language)

print("Loading data: " + file_path)
data = load_file(file_path)
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,129 +1,118 @@
f degree Fahrenheit
°f degree Fahrenheit
degree Fahrenheit
°F degree Fahrenheit
amu atomic mass unit
bar bar
°c degree Celsius
°C degree Celsius
degree Celsius
km kilometer
m meter
cm2 square centimeter
cm² square centimeter
cm3 cubic centimeter
cm³ cubic centimeter
cm centimeter
mm millimeter
ha hectare
mi mile
square meter
m2 square meter
km² square kilometer
km2 square kilometer
cwt hundredweight
db decibel
dm3 cubic decimeter
dm³ cubic decimeter
dm decimeter
ds decisecond
°f degree Fahrenheit
°F degree Fahrenheit
degree Fahrenheit
ft foot
% percent
ghz gigahertz
gw gigawatt
gwh gigawatt hour
hz hertz
kw kilowatt
kW kilowatt
hp horsepower
mg milligram
" inch
kbps kilobit per second
kcal kilocalorie
kgf kilogram force
kg kilogram
ghz gigahertz
khz kilohertz
mhz megahertz
km2 square kilometer
km² square kilometer
km kilometer
kpa kilopascal
kwh kilowatt hour
kw kilowatt
kW kilowatt
lb pound
lbs pound
v volt
h hour
mc mega coulomb
s second
nm nanometer
rpm revolution per minute
min minute
mA milli ampere
kwh kilo watt hour
cubic meter
m2 square meter
square meter
m3 cubic meter
mph mile per hour
mv milli volt
mw megawatt
μm micrometer
" inch
tb terabyte
cc c c
g gram
da dalton
atm atmosphere
ω ohm
db decibel
ps peta second
oz ounce
hl hecto liter
μg microgram
pg petagram
gb gigabyte
MB megabyte
GB gigabyte
TB terabyte
PB petabyte
EB exabyte
ZB zettabyte
YB yottabyte
kb kilobit
ev electron volt
mb megabyte
kb kilobyte
kbps kilobit per second
cubic meter
mbps megabit per second
kl kilo liter
tj tera joule
kv kilo volt
mv mega volt
kn kilonewton
mm megameter
au astronomical unit
yd yard
rad radian
lm lumen
hs hecto second
mol mole
gpa giga pascal
mg milligram
mhz megahertz
mi2 square mile
mi² square mile
mi mile
min minute
ml milliliter
gw gigawatt
ma mega ampere
kt knot
kgf kilogram force
ng nano gram
mm2 square millimeter
mm² square millimeter
mol mole
mpa megapascal
mph mile per hour
ng nanogram
nm nanometer
ns nanosecond
ms mega siemens
bar bar
gl giga liter
μs microsecond
oz ounce
pa pascal
ds deci second
ms milli second
dm deci meter
dm³ cubic deci meter
dm3 cubic deci meter
amu atomic mass unit
mb megabit
mf mega farad
bq becquerel
pb petabit
mm² square millimeter
mm2 square millimeter
cm² square centimeter
cm2 square centimeter
cm³ cubic centimeter
cm3 cubic centimeter
sq mi square mile
mi² square mile
mi2 square mile
% percent
rad radian
rpm revolution per minute
sq ft square foot
kpa kilopascal
cd candela
tl tera liter
ms mega second
mpa megapascal
pb peta byte
gwh giga watt hour
kcal kilo calory
gy gray
sq mi square mile
sv sievert
cwt hundredweight
cc c c
tb terabyte
tj terajoule
tl teraliter
v volt
yd yard
μg microgram
μm micrometer
μs microsecond
ω ohm
atm ATM
au AU
bq BQ
cc CC
cd CD
da DA
eb EB
ev EV
f F
gb GB
g G
gl GL
gpa GPA
gy GY
ha HA
h H
hl HL
hp HP
hs HS
kb KB
kl KL
kn KN
kt KT
kv KV
lm LM
ma MA
mA MA
mb MB
mc MC
mf MF
m M
mm MM
ms MS
mv MV
mw MW
pb PB
pg PG
ps PS
s S
tb TB
yb YB
zb ZB
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
atm atmosphere
bq becquerel
cd candela
da dalton
eb exabyte
f degree Fahrenheit
gb gigabyte
g gram
gl gigaliter
ha hectare
h hour
hl hectoliter
hp horsepower
hp horsepower
kb kilobit
kb kilobyte
ma megaampere
mA megaampere
ma milliampere
mA milliampere
mb megabyte
mc megacoulomb
mf megafarad
m meter
m minute
mm millimeter
mm millimeter
mm millimeter
ms megasecond
ms mega siemens
ms millisecond
mv millivolt
mV millivolt
mw megawatt
mW megawatt
pb petabyte
pg petagram
ps petasecond
s second
tb terabyte
tb terabyte
yb yottabyte
zb zettabyte
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ CLASS
PART
Part
part
article
Article
Section
section
Loading

0 comments on commit ea201d2

Please sign in to comment.