Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def parse_args():
"--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str
)
parser.add_argument(
"--language", help="Select target language", choices=["en", "ru", "de", "es"], default="en", type=str
"--language", help="Select target language", choices=["en", "ru", "de", "es", "sv"], default="en", type=str
)
parser.add_argument("--manifest", default=None, help="path to .json manifest")
parser.add_argument(
Expand Down
39 changes: 26 additions & 13 deletions nemo_text_processing/text_normalization/sv/taggers/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,12 @@ def make_million(number: str, non_zero_no_one: 'pynini.FstLike', deterministic:
for one in ["en", "ett"]:
graph |= pynutil.add_weight(pynini.cross("001", f"{one} {number}"), -0.001)
graph |= pynutil.add_weight(pynini.cross("001", f"{one} {old_orth}"), -0.001)
graph |= pynutil.add_weight(pynini.cross("001", f"{one}{number}"), -0.001)
graph |= pynutil.add_weight(pynini.cross("001", f"{one}{old_orth}"), -0.001)
graph |= non_zero_no_one + pynutil.insert(f" {number}er")
if not deterministic:
graph |= pynutil.add_weight(non_zero_no_one + pynutil.insert(f" {old_orth}er"), -0.001)
graph |= pynutil.add_weight(non_zero_no_one + pynutil.insert(f"{old_orth}er"), -0.001)
graph |= pynutil.delete("000")
graph += insert_space
return graph
Expand Down Expand Up @@ -100,17 +103,17 @@ def __init__(self, deterministic: bool = True):
digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv")))
ties = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv")))
ett_to_en = pynini.string_map([("ett", "en")])
ett_to_en = pynini.cross("ett", "en")
ties_alt_endings = pynini.string_map([("go", "gi"), ("tio", "ti")])

# Any single digit
graph_digit = digit
digits_no_one = (NEMO_DIGIT - "1") @ graph_digit
both_ones = pynini.cross("1", "en") | pynini.cross("1", "ett")
if deterministic:
final_digit = digit
else:
final_digit = digits_no_one | both_ones
final_digit = digit | pynini.cross("1", "en")
graph_digit = final_digit
self.digit = final_digit

single_digits_graph = graph_digit | zero
Expand All @@ -131,14 +134,13 @@ def __init__(self, deterministic: bool = True):
else:
graph_tens |= pynutil.add_weight(pynini.cross("18", "aderton"), -0.001)
graph_tens |= pynutil.add_weight(
graph_ties + (pynutil.delete('0') | (graph_digit | pynutil.insert(' ') + graph_digit)), -0.001
graph_ties + (pynutil.delete('0') | (graph_digit | insert_space + graph_digit)), -0.001
)

hundreds = digits_no_one + pynutil.insert("hundra")
hundreds |= pynini.cross("1", "hundra")
if not deterministic:
hundreds |= pynutil.add_weight(pynini.cross("1", "etthundra"), -0.001)
hundreds |= pynutil.add_weight(pynini.cross("1", "ett hundra"), -0.001)
hundreds |= pynutil.add_weight(digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra"), -0.001)

self.tens = graph_tens.optimize()
Expand Down Expand Up @@ -180,6 +182,7 @@ def __init__(self, deterministic: bool = True):
graph_hundreds_component_at_least_one_non_zero_digit = graph_hundreds_component | (
pynutil.delete("00") + graph_digit
)

graph_hundreds_component_at_least_one_non_zero_digit_no_one = graph_hundreds_component | (
pynutil.delete("00") + digits_no_one
)
Expand All @@ -192,18 +195,21 @@ def __init__(self, deterministic: bool = True):
if not deterministic:
tusen |= pynutil.add_weight(pynutil.insert(" tusen"), -0.001)
etttusen = tusen
etttusen |= pynutil.add_weight(pynutil.insert("ettusen"), -0.001)
etttusen |= pynutil.add_weight(pynutil.insert(" ettusen"), -0.001)
etttusen |= pynutil.add_weight(pynutil.insert("etttusen"), -0.001)
etttusen |= pynutil.add_weight(pynutil.insert(" etttusen"), -0.001)
etttusen |= pynutil.add_weight(pynutil.insert("ett tusen"), -0.001)
etttusen |= pynutil.add_weight(pynutil.insert(" ett tusen"), -0.001)

following_hundred = insert_space + graph_hundreds_component_at_least_one_non_zero_digit
if not deterministic:
following_hundred |= graph_hundreds_component_at_least_one_non_zero_digit

graph_thousands_component_at_least_one_non_zero_digit = pynini.union(
pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit,
graph_hundreds_component_at_least_one_non_zero_digit_no_one
+ tusen
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
pynini.cross("001", etttusen)
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
+ (following_hundred | pynutil.delete("000")),
pynini.cross("001", etttusen) + (following_hundred | pynutil.delete("000")),
)
self.graph_thousands_component_at_least_one_non_zero_digit = (
graph_thousands_component_at_least_one_non_zero_digit.optimize()
Expand All @@ -213,9 +219,8 @@ def __init__(self, deterministic: bool = True):
pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit_no_one,
graph_hundreds_component_at_least_one_non_zero_digit_no_one
+ tusen
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
pynini.cross("001", etttusen)
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
+ (following_hundred | pynutil.delete("000")),
pynini.cross("001", etttusen) + (following_hundred | pynutil.delete("000")),
)
self.graph_thousands_component_at_least_one_non_zero_digit_no_one = (
graph_thousands_component_at_least_one_non_zero_digit_no_one.optimize()
Expand Down Expand Up @@ -326,11 +331,19 @@ def __init__(self, deterministic: bool = True):

self.graph |= zero

self.graph_unfiltered = self.graph
self.graph = filter_punctuation(self.graph).optimize()
self.graph_en = self.graph @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)
self.graph_no_one = (pynini.project(self.graph, "input") - "1") @ self.graph
self.graph_no_one_en = (pynini.project(self.graph_en, "input") - "1") @ self.graph_en

joiner_chars = pynini.union("-", "–", "—")
joiner = pynini.cross(joiner_chars, " till ")
self.range = self.graph + joiner + self.graph
if not deterministic:
either_one = self.graph | self.graph_en
self.range = either_one + joiner + either_one

optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
Expand Down
18 changes: 15 additions & 3 deletions nemo_text_processing/text_normalization/sv/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,28 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool):

# prefer cardinal over year
year_first = ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 1)) @ numbers
year_second = (
pynini.union((NEMO_DIGIT - "0") + (NEMO_DIGIT - "0"), "0" + (NEMO_DIGIT - "0"), (NEMO_DIGIT - "0") + "0")
@ numbers
year_second = pynini.union(
((NEMO_DIGIT - "0") + (NEMO_DIGIT - "0")) @ numbers,
pynini.cross("0", "hundra") + ((NEMO_DIGIT - "0") @ numbers),
((NEMO_DIGIT - "0") + "0") @ numbers,
)
year_hundra = year_first + pynutil.insert("hundra") + year_second
year_hundra |= year_first + pynutil.insert(" hundra") + year_second
year_hundra |= year_first + pynutil.insert(" hundra ") + year_second
year_hundra |= year_first + pynutil.insert("hundra ") + year_second
year_second |= pynini.cross("00", "hundra")
year_cardinal = ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)) @ numbers
year = pynini.union(year_first + year_second, year_first) # 90, 990, 1990
if not deterministic:
year |= year_cardinal
year |= year_hundra
self.year = year
self.year_cardinal = year_cardinal
sou_number = self.year + pynini.cross(":", " kolon ") + numbers
sou_word = pynini.accep("SOU")
if not deterministic:
sou_word |= pynini.cross("SOU", "statens offentliga utredningar")
self.sou = sou_word + NEMO_SPACE + sou_number

year_second_decades = ((NEMO_DIGIT - "0") + "0") @ numbers
year_second_decades |= pynini.cross("00", "hundra")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,13 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):

self.suffixed_to_words = self.suffixed_ordinal @ self.graph

self.bare_ordinals = cleaned_graph
kapitlet_word = pynini.union("kapitlet", pynini.cross("kap", "kapitlet"))
kapitlet = cleaned_graph + NEMO_SPACE + kapitlet_word

tok_graph = (
pynutil.insert("integer: \"")
+ (cleaned_graph + pynutil.delete(".") | self.suffixed_to_words)
+ (cleaned_graph + pynutil.delete(".") | self.suffixed_to_words | kapitlet)
+ pynutil.insert("\"")
)

Expand Down
Loading