In [None]:
"""
You can run this notebook either locally or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Optional: Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""

> **_NOTE:_** Find the official NeMo documentation at 
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/intro.html 

# Overview
<img src="https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/text_processing/images/task_overview.png" width="600"/>

A sentence can be split up into semiotic tokens stemming from a variety of classes, where the spoken form differs from the written form. Examples are *dates*, *decimals*, *cardinals*, *measures*, etc. A good TN or ITN system should be able to handle a variety of **semiotic classes**.

# How to use
## 1. Installation

In [None]:
## Install NeMo, which installs both nemo and nemo_text_processing package
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]

# install Pynini for text normalization
! wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh
! bash install_pynini.sh

In [None]:
# try to import nemo_text_processing and other dependencies
import nemo_text_processing
import os

## 2. Text Normalization

In [None]:
# Build an English text normalizer that works on cased input.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(
    lang='en',
    input_case='cased',
)

In [None]:
# Print the class docstring of the normalizer, which describes the parameterization it offers.
print(normalizer.__doc__)

> **_NOTE:_** Standard Text Normalization uses `deterministic=True`, outputting a single output for a given input string



### 2.1 Run TN on input string

In [None]:
# Print the docstring of Normalizer.normalize(), which describes the parameterization it offers.
print(normalizer.normalize.__doc__)

In [None]:
# Normalize a single example sentence and print the result.
written = "We paid $123 for this desk."
normalized = normalizer.normalize(
    written,
    punct_post_process=True,
    verbose=True,
)
print(normalized)

Intermediate semiotic class information is shown if `verbose=True`.

Long input text could be split into sentences as follows:

In [None]:
written = "Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk."

# Break the long text into sentences and show each one on its own line.
sentences = normalizer.split_text_into_sentences(written)
for sentence in sentences:
    print(sentence)

# Normalize all sentences at once with normalize_list();
# alternatively, normalize() can be called on each sentence separately.
normalizer.normalize_list(sentences)


### 2.2 Run TN on list of input strings

In [None]:
# Create a temporary data folder and a two-line example input file.
DATA_DIR = 'tmp_data_dir'
os.makedirs(DATA_DIR, exist_ok=True)
INPUT_FILE = f'{DATA_DIR}/inference.txt'
# Write the file from Python instead of `echo -e`: the `-e` flag is not portable across
# shells (some `sh` implementations print it literally), which would corrupt the first line.
with open(INPUT_FILE, 'w') as fp:
    fp.write('The alarm went off at 10:00a.m. \nI received $123\n')

In [None]:
# Print the input file to check that it was created with the expected contents.
! cat $INPUT_FILE

In [None]:
# Read the input file into 'data': one string per line, with surrounding whitespace stripped.
with open(INPUT_FILE, 'r') as fp:
    data = [line.strip() for line in fp]
data

In [None]:
# Normalize every string in 'data' with a single normalize_list() call.
normalizer.normalize_list(data, punct_post_process=True)

### 2.3 Evaluate TN on written-normalized text pairs 

The evaluation data needs to have the following format.

For example, the sentence 'on 22 july 2012 they worked until 12:00' and its normalization are represented as:

In [None]:
# Example evaluation sentence in the tab-separated evaluation format:
# one token per line as [SEMIOTIC CLASS]<TAB>[WRITTEN]<TAB>[NORMALIZED], where plain tokens
# use <self> as the normalized form, followed by an <eos> line that ends the sentence.
eval_text =  """PLAIN\ton\t<self>
DATE\t22 july 2012\tthe twenty second of july twenty twelve
PLAIN\tthey\t<self>
PLAIN\tworked\t<self>
PLAIN\tuntil\t<self>
TIME\t12:00\ttwelve o'clock
<eos>\t<eos>
"""
# Write the example into the data folder so it can be loaded as an evaluation file.
EVAL_FILE = f'{DATA_DIR}/eval.txt'
with open(EVAL_FILE, 'w') as fp:
    fp.write(eval_text)
# Print the file to check its contents.
! cat $EVAL_FILE

That is, every sentence is broken into semiotic tokens, one per line, and terminated by the end-of-sentence token `<eos>`. For a plain token the line is `[SEMIOTIC CLASS] [TAB] [WRITTEN] [TAB] <self>`; otherwise it is `[SEMIOTIC CLASS] [TAB] [WRITTEN] [TAB] [NORMALIZED]`.
This format was introduced in [Google Text normalization dataset](https://arxiv.org/abs/1611.00068). 

In [None]:
# Load the evaluation file and split it into written and normalized sentences.
from nemo_text_processing.text_normalization.data_loader_utils import (
    load_files,
    training_data_to_sentences,
)

eval_data = load_files([EVAL_FILE])
(
    sentences_un_normalized,
    sentences_normalized,
    sentences_class_types,
) = training_data_to_sentences(eval_data)

# Show each written sentence paired with its reference normalization.
print([pair for pair in zip(sentences_un_normalized, sentences_normalized)])

In [None]:
# Run the normalizer on the written sentences to obtain the predictions to evaluate.
sentences_prediction = normalizer.normalize_list(sentences_un_normalized)
print(sentences_prediction)

In [None]:
# Compare the predictions against the reference normalizations and report sentence accuracy.
from nemo_text_processing.text_normalization.data_loader_utils import evaluate

sentences_accuracy = evaluate(
    preds=sentences_prediction,
    labels=sentences_normalized,
    input=sentences_un_normalized,
)
print(f"- Accuracy: {sentences_accuracy}")

## 3. Inverse Text Normalization
ITN supports an API equivalent to that of TN. Here we only show inverse normalization on an input string.

In [None]:
# Create an inverse text normalization (ITN) instance with lang='en'.
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
inverse_normalizer = InverseNormalizer(lang='en')

In [None]:
# Apply ITN to a single spoken-form example sentence and print the result.
spoken = "we paid one hundred twenty three dollars for this desk"
un_normalized = inverse_normalizer.inverse_normalize(
    spoken,
    verbose=True,
)
print(un_normalized)

## 4. Audio-based Text Normalization
Audio-based text normalization uses extended [WFST](https://en.wikipedia.org/wiki/Finite-state_machine) grammars to provide a range of possible normalization options.
The following example shows the workflow (disclaimer: the exact values in the graphic do not necessarily reflect the real system's behavior):
1. text "627" is sent to extended TN WFST grammar
2. the grammar outputs 5 different verbalization options based on the text input alone
3. in case an audio file is presented we compare the audio transcript with the verbalization options to find out which normalization is correct based on character error rate. The transcript is generated using a pretrained NeMo ASR model. 


<img src="https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/text_processing/images/audio_based_tn.png" width="600"/>

The following shows an example of how to generate multiple normalization options:

In [None]:
# import non-deterministic WFST-based TN module
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

In [None]:
# Initialize the audio-based normalizer. Generating the extended grammars may take some time,
# so we recommend caching the grammars by specifying a cache directory.
# A dedicated variable name is used so that the `Normalizer` instance stored in `normalizer`
# is not overwritten; cells that use `normalizer` keep working if they are re-run after this one.
normalizer_with_audio = NormalizerWithAudio(
    lang="en",
    input_case="cased",
    overwrite_cache=False,
    cache_dir="cache_dir",
)
# create up to 10 normalization options
print(normalizer_with_audio.normalize("123", n_tagged=10, punct_post_process=True))

## 5. Parallel execution

`Normalizer.normalize()` as well as `InverseNormalizer.inverse_normalize()` are functions without side effects.
Thus, if you need to normalize large amounts of input examples, these can be executed in parallel.

# Tutorial on how to customize grammars

https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb


# References and Further Reading:


- [Zhang, Yang, Bakhturina, Evelina, Gorman, Kyle and Ginsburg, Boris. "NeMo Inverse Text Normalization: From Development To Production." (2021)](https://arxiv.org/abs/2104.05055)
- [Ebden, Peter, and Richard Sproat. "The Kestrel TTS text normalization system." Natural Language Engineering 21.3 (2015): 333.](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)
- [Gorman, Kyle. "Pynini: A Python library for weighted finite-state grammar compilation." Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. 2016.](https://www.aclweb.org/anthology/W16-2409.pdf)
- [Mohri, Mehryar, Fernando Pereira, and Michael Riley. "Weighted finite-state transducers in speech recognition." Computer Speech & Language 16.1 (2002): 69-88.](https://cs.nyu.edu/~mohri/postscript/csl01.pdf)