Skip to content

Commit

Permalink
Add "and" for cardinal part
Browse files (browse the repository at this point in the history)
Signed-off-by: ekmb <ebakhturina@nvidia.com>
  • Loading branch information
ekmb committed Apr 14, 2022
1 parent caa4fbd commit 8c09dfe
Show file tree
Hide file tree
Showing 11 changed files with 67 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
Expand Down Expand Up @@ -130,6 +131,7 @@ def __init__(
whitelist = WhiteListFst(input_case=input_case, deterministic=False, input_file=whitelist)
whitelist_graph = whitelist.graph
punct_graph = PunctuationFst(deterministic=True).graph
serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic, lm=True).fst

# VERBALIZERS
cardinal = vCardinal(deterministic=True)
Expand Down Expand Up @@ -168,6 +170,9 @@ def __init__(
| pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
| pynutil.add_weight(cardinal_or_date_final, sem_w)
| pynutil.add_weight(whitelist_graph, sem_w)
| pynutil.add_weight(
pynini.compose(serial_graph, v_cardinal_graph), 1.1001
) # should be higher than the rest of the classes
).optimize()

roman_graph = RomanFst(deterministic=deterministic, lm=True).fst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
Expand Down Expand Up @@ -128,6 +129,7 @@ def __init__(
whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
whitelist_graph = whitelist.graph
punct_graph = PunctuationFst(deterministic=deterministic).graph
serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst

# VERBALIZERS
cardinal = vCardinal(deterministic=deterministic)
Expand Down Expand Up @@ -174,6 +176,9 @@ def __init__(
| pynutil.add_weight(word_graph, word_w)
| pynutil.add_weight(pynini.compose(date_graph, v_date_graph), sem_w - 0.01)
| pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)
| pynutil.add_weight(
pynini.compose(serial_graph, v_cardinal_graph), 1.1001
) # should be higher than the rest of the classes
).optimize()

if not deterministic:
Expand Down
29 changes: 24 additions & 5 deletions nemo_text_processing/text_normalization/en/verbalizers/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,33 @@ def __init__(self, deterministic: bool = True):

self.optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "minus ") + delete_space, 0, 1)

if deterministic:
integer = pynini.closure(NEMO_NOT_QUOTE, 1)
else:
integer = (
no_thousand_million = pynini.difference(
pynini.closure(NEMO_NOT_QUOTE),
pynini.closure(NEMO_NOT_QUOTE) + pynini.union("thousand", "million") + pynini.closure(NEMO_NOT_QUOTE),
).optimize()
integer = (
pynini.closure(NEMO_NOT_QUOTE)
+ pynini.closure(
pynutil.add_weight(pynini.cross("hundred ", "hundred and ") + no_thousand_million, -0.0001), 0, 1
).optimize()
)
no_hundred = pynini.difference(
pynini.closure(NEMO_NOT_QUOTE),
pynini.closure(NEMO_NOT_QUOTE) + "hundred" + pynini.closure(NEMO_NOT_QUOTE),
).optimize()
integer |= (
pynini.closure(NEMO_NOT_QUOTE)
+ pynini.closure(
pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001), 0, 1
).optimize()
)

if not deterministic:
integer |= (
pynini.closure(NEMO_NOT_QUOTE)
+ pynini.closure(pynini.cross("hundred ", "hundred and ") | pynini.cross("hundred ", " "), 0, 1)
+ pynini.closure(NEMO_NOT_QUOTE)
)
).optimize()

self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
integer = pynutil.delete("integer:") + self.integer
Expand Down
13 changes: 1 addition & 12 deletions nemo_text_processing/text_normalization/en/verbalizers/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,7 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True):
pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
)

integer_part = (
pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
)

if not deterministic:
integer_part |= (
pynutil.delete("integer_part: \"")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynini.cross("hundred ", "hundred and ")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynutil.delete("\"")
)
integer_part = decimal.integer

optional_add_and = pynini.closure(pynutil.insert("and "), 0, 1)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
2~two
-2~minus two
3~three
123~one hundred twenty three
123~one hundred and twenty three
13,000~thirteen thousand
13000~one three zero zero zero
123,123,000~one hundred twenty three million one hundred twenty three thousand
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
3 mbps~three megabits per second
3 cc/s~three c c per second
100 million kg~one hundred million kilograms
a 123.2-millimeters long~a one hundred twenty three point two millimeters long
a 123.2-millimeters long~a one hundred and twenty three point two millimeters long
covid-19.5~covid- nineteen point five
7.2-millimeter bullet~seven point two millimeter bullet
4 1/2 lbs~four and a half pounds
4.4x~four point four x
80 s~eighty seconds
1234-123kg~one thousand two hundred thirty four to one hundred twenty three kilograms
1234-123kg~one thousand two hundred thirty four to one hundred and twenty three kilograms
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$2~two dollars
$2.00~two dollars
₩460 billion~four hundred sixty billion won
₩460 billion~four hundred and sixty billion won
¥30 billion~thirty billion yen
$45 billion~forty five billion dollars
$0.2 million~zero point two million dollars
Expand All @@ -27,28 +27,28 @@ $.506~point five zero six dollars
$18~eighteen dollars
$18925000~eighteen million nine hundred twenty five thousand dollars
$18,925,000~eighteen million nine hundred twenty five thousand dollars
$18854~eighteen thousand eight hundred fifty four dollars
$18801~eighteen thousand eight hundred one dollars
$18875~eighteen thousand eight hundred seventy five dollars
$18081~eighteen thousand eighty one dollars
$18052~eighteen thousand fifty two dollars
$18542~eighteen thousand five hundred forty two dollars
$18519~eighteen thousand five hundred nineteen dollars
$18570~eighteen thousand five hundred seventy dollars
$18578~eighteen thousand five hundred seventy eight dollars
$18516~eighteen thousand five hundred sixteen dollars
$18482~eighteen thousand four hundred eighty two dollars
$18478~eighteen thousand four hundred seventy eight dollars
$18468~eighteen thousand four hundred sixty eight dollars
$18903~eighteen thousand nine hundred three dollars
$18929~eighteen thousand nine hundred twenty nine dollars
$18095~eighteen thousand ninety five dollars
$18117~eighteen thousand one hundred seventeen dollars
$18128~eighteen thousand one hundred twenty eight dollars
$18125~eighteen thousand one hundred twenty five dollars
$18124~eighteen thousand one hundred twenty four dollars
$18129~eighteen thousand one hundred twenty nine dollars
$18854~eighteen thousand eight hundred and fifty four dollars
$18801~eighteen thousand eight hundred and one dollars
$18875~eighteen thousand eight hundred and seventy five dollars
$18081~eighteen thousand and eighty one dollars
$18052~eighteen thousand and fifty two dollars
$18542~eighteen thousand five hundred and forty two dollars
$18519~eighteen thousand five hundred and nineteen dollars
$18570~eighteen thousand five hundred and seventy dollars
$18578~eighteen thousand five hundred and seventy eight dollars
$18516~eighteen thousand five hundred and sixteen dollars
$18482~eighteen thousand four hundred and eighty two dollars
$18478~eighteen thousand four hundred and seventy eight dollars
$18468~eighteen thousand four hundred and sixty eight dollars
$18903~eighteen thousand nine hundred and three dollars
$18929~eighteen thousand nine hundred and twenty nine dollars
$18095~eighteen thousand and ninety five dollars
$18117~eighteen thousand one hundred and seventeen dollars
$18128~eighteen thousand one hundred and twenty eight dollars
$18125~eighteen thousand one hundred and twenty five dollars
$18124~eighteen thousand one hundred and twenty four dollars
$18129~eighteen thousand one hundred and twenty nine dollars
£18000~eighteen thousand pounds
$50.00-$100.00~fifty dollars - one hundred dollars
$1,925.21~one thousand nine hundred twenty five dollars twenty one cents
$1,234.123~one thousand two hundred thirty four point one two three dollars
$1,925.21~one thousand nine hundred and twenty five dollars twenty one cents
$1,234.123~one thousand two hundred and thirty four point one two three dollars
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ from 2-10~from two to ten
2-5lb~two to five pounds
2x8 m2~two by eight square meters
170 - 45190~one hundred seventy to forty five thousand one hundred ninety
1980-1970 kg~one thousand nine hundred eighty to one thousand nine hundred seventy kilograms
1980-1970 kg~one thousand nine hundred eighty to one thousand nine hundred and seventy kilograms
5pm-7 pm~five PM to seven PM
4:00 am - 8:12 pm~four AM to eight twelve PM
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ MIG-25/235212-asdg~MIG-twenty five/two three five two one two-asdg
2x~two x
11:30EST~eleven : thirty EST
31/31/100~thirty one/thirty one/one hundred
1-8090~one - eight thousand ninety
1-8090~one - eight thousand and ninety
f++~f plus plus
dfsd#sfgsfd$~dfsd hash sfgsfd dollar
dfsd$$sfgsfd$~dfsd dollar dollar sfgsfd dollar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ go <noise/> home~go <noise/> home
go<noise/>home~go <noise/> home
<pausing/>~<pausing/>
<cough/>~<cough/>
<pitch=-.3>123?123</pitch>~<pitch=-.3> one hundred twenty three ? one hundred twenty three </pitch>
<pitch=-.3>123?123</pitch>~<pitch=-.3> one hundred and twenty three ? one hundred and twenty three </pitch>
<singing> haleluja </singing>~<singing> haleluja </singing>
4 changes: 2 additions & 2 deletions tests/nemo_text_processing/en/test_money.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class TestMoney:
@pytest.mark.unit
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
assert pred == expected
assert pred == expected, f"input: {test_input}"

normalizer_en = (
Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
Expand All @@ -56,7 +56,7 @@ def test_denorm(self, test_input, expected):
@pytest.mark.unit
def test_norm(self, test_input, expected):
pred = self.normalizer_en.normalize(test_input, verbose=False)
assert pred == expected
assert pred == expected, f"input: {test_input}"

if self.normalizer_with_audio_en:
pred_non_deterministic = self.normalizer_with_audio_en.normalize(
Expand Down

0 comments on commit 8c09dfe

Please sign in to comment.