Skip to content

Commit

Permalink
Add ITN pt (#4516)
Browse files Browse the repository at this point in the history
* Add ITN pt

Signed-off-by: Guilherme Steinmann <guist@linse.ufsc.br>

* Fix style

Signed-off-by: Guilherme Steinmann <guist@linse.ufsc.br>

* Fix style

Signed-off-by: Guilherme Steinmann <guist@linse.ufsc.br>

* Update copyright year to 2022 on ITN pt rules and tests

Signed-off-by: Guilherme Steinmann <guist@linse.ufsc.br>
  • Loading branch information
guidefloripa committed Jul 7, 2022
1 parent 1f97094 commit 2089016
Show file tree
Hide file tree
Showing 87 changed files with 3,386 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

class InverseNormalizer(Normalizer):
"""
Inverse normalizer that converts text from spoken to written form. Useful for ASR postprocessing.
Inverse normalizer that converts text from spoken to written form. Useful for ASR postprocessing.
Input is expected to have no punctuation outside of apostrophe (') and dash (-) and be lower cased.
Args:
Expand All @@ -46,6 +46,12 @@ def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: boo
VerbalizeFinalFst,
)

elif lang == 'pt':
from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)

elif lang == 'ru':
from nemo_text_processing.inverse_text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import (
Expand Down Expand Up @@ -75,7 +81,7 @@ def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: boo

def inverse_normalize_list(self, texts: List[str], verbose=False) -> List[str]:
"""
NeMo inverse text normalizer
NeMo inverse text normalizer
Args:
texts: list of input strings
Expand Down Expand Up @@ -106,7 +112,7 @@ def parse_args():
input.add_argument("--input_file", dest="input_file", help="input file path", type=str)
parser.add_argument('--output_file', dest="output_file", help="output file path", type=str)
parser.add_argument(
"--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str
"--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str
)
parser.add_argument("--verbose", help="print info for debugging", action='store_true')
parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true")
Expand Down
17 changes: 17 additions & 0 deletions nemo_text_processing/inverse_text_normalization/pt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import VerbalizeFinalFst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
€ euros
£ libras esterlinas
US$ dólares americanos
$ dólares
R$ reais
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
€ euro
£ libra esterlina
US$ dólar americano
$ dólar
R$ real
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
com
es
uk
fr
net
br
in
ru
de
it
edu
co
ar
bo
cl
co
ec
fk
gf
fy
pe
py
sr
ve
uy
pt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
gmail g mail
gmail
nvidia n vidia
nvidia
outlook
hotmail
yahoo
aol
live
msn
live
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
. ponto
- traço
- hífen
_ traço baixo
_ underscore
/ barra
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
h horas
min minutos
s segundos
ms milissegundos
ns nanossegundos
μs microssegundos
t toneladas
kg quilos
kg quilogramas
g gramas
mg miligramas
μm micrômetros
nm nanômetros
mm milímetros
cm centímetros
cm² centímetros quadrados
cm³ centímetros cúbicos
m metros
m² metros quadrados
m³ metros cúbicos
km quilômetros
km² quilômetros quadrados
ha hectares
kph quilômetros por hora
mph milhas por hora
m/s metros por segundo
l litros
ml mililitros
kgf quilogramas forças
kgf quilogramas força
% por cento
°F fahrenheit
°C celsius
°F graus fahrenheit
°C graus celsius
Hz hertz
kHz quilo hertz
MHz mega hertz
GHz giga hertz
W watts
kW quilowatts
MW megawatts
GW gigawatts
Wh watts hora
kWh quilowatts hora
MWh megawatts hora
GWh gigawatts hora
kV quilovolts
V volts
mV milivolts
A amperes
mA miliamperes
rpm rotações por minuto
db decibéis
cal calorias
kcal quilocalorias
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
h hora
min minuto
s segundo
ms milissegundo
ns nanossegundo
μs microssegundo
t tonelada
kg quilo
kg quilograma
g grama
mg miligrama
μm micrômetro
nm nanômetro
mm milímetro
cm centímetro
cm² centímetro quadrado
cm³ centímetro cúbico
m metro
m² metro quadrado
m³ metro cúbico
km quilômetro
km² quilômetro quadrado
ha hectare
kph quilômetro por hora
mph milha por hora
m/s metro por segundo
l litro
ml mililitro
kgf quilograma força
% por cento
°F fahrenheit
°C celsius
°F grau fahrenheit
°C grau celsius
Hz hertz
kHz quilo hertz
MHz mega hertz
GHz giga hertz
W watt
kW quilowatt
MW megawatt
GW gigawatt
Wh watt hora
kWh quilowatt hora
MWh megawatt hora
GWh gigawatt hora
kV quilovolt
V volt
mV milivolt
A ampere
mA miliampere
rpm rotação por minuto
db decibel
cal caloria
kcal quilocaloria
12 changes: 12 additions & 0 deletions nemo_text_processing/inverse_text_normalization/pt/data/months.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
janeiro
fevereiro
março
abril
maio
junho
julho
agosto
setembro
outubro
novembro
dezembro
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
um 1
uma 1
dois 2
duas 2
três 3
quatro 4
cinco 5
seis 6
sete 7
oito 8
nove 9
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
cento 1
duzentos 2
duzentas 2
trezentos 3
trezentas 3
quatrocentos 4
quatrocentas 4
quinhentos 5
quinhentas 5
seiscentos 6
seiscentas 6
setecentos 7
setecentas 7
oitocentos 8
oitocentas 8
novecentos 9
novecentas 9
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cem 100
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
dez 10
onze 11
doze 12
treze 13
catorze 14
quatorze 14
quinze 15
dezesseis 16
dezessete 17
dezoito 18
dezenove 19
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
vinte 2
trinta 3
quarenta 4
cinquenta 5
sessenta 6
setenta 7
oitenta 8
noventa 9
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
vinte um 21
vinte dois 22
vinte três 23
vinte quatro 24
vinte cinco 25
vinte seis 26
vinte sete 27
vinte oito 28
vinte nove 29
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
zero 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
primeiro 1
primeira 1
segundo 2
segunda 2
terceiro 3
terceira 3
quarto 4
quarta 4
quinto 5
quinta 5
sexto 6
sexta 6
sétimo 7
sétima 7
oitavo 8
oitava 8
nono 9
nona 9
Loading

0 comments on commit 2089016

Please sign in to comment.